#include "saber/funcs/impl/arm/neon/impl/conv_arm_impl.h"

#ifdef USE_ARM_PLACE

namespace anakin{

namespace saber{

//! Forward declarations of the 5x5, stride-2, pad-2 depthwise kernels
//! selected by conv_depthwise_5x5s2() below.
//! Naming convention (inferred from the dispatcher — confirm against the
//! armv7 implementations if they live elsewhere):
//!   *_relu  : kernel with fused ReLU activation
//!   *_s     : "small" variant for narrow inputs (w_in < 9)
void conv_depthwise_5x5s2p2(const float* din, float* dout, \
        int num, int ch_out, int h_out, int w_out, \
        int ch_in, int h_in, int w_in, \
        const float* weights, const float* bias, \
        bool flag_bias, bool flag_relu, Context<ARM>* ctx);

void conv_depthwise_5x5s2p2_relu(const float* din, float* dout, \
        int num, int ch_out, int h_out, int w_out, \
        int ch_in, int h_in, int w_in, \
        const float* weights, const float* bias, \
        bool flag_bias, bool flag_relu, Context<ARM>* ctx);

void conv_depthwise_5x5s2p2_s(const float* din, float* dout, \
        int num, int ch_out, int h_out, int w_out, \
        int ch_in, int h_in, int w_in, \
        const float* weights, const float* bias, \
        bool flag_bias, bool flag_relu, Context<ARM>* ctx);

void conv_depthwise_5x5s2p2_relu_s(const float* din, float* dout, \
        int num, int ch_out, int h_out, int w_out, \
        int ch_in, int h_in, int w_in, \
        const float* weights, const float* bias, \
        bool flag_bias, bool flag_relu, Context<ARM>* ctx);

//! Dispatch entry for 5x5 stride-2 depthwise convolution.
//! Only pad == 2 is implemented here; any other padding is a no-op.
//! Wide inputs (win >= 9) go to the vectorized kernels, narrow inputs
//! fall back to the *_s variants; flag_relu selects the fused-ReLU form.
void conv_depthwise_5x5s2(const float* din,float* dout, \
                        int num, int chout, int hout, int wout, \
                        int chin, int hin, int win, \
                        const float* weights, const float* bias, \
                        int pad, bool flag_bias, bool flag_relu, \
                        Context<ARM>* ctx) {

    if (pad != 2) {
        //! unsupported padding: nothing to do (matches original behavior)
        return;
    }
    const bool wide = (win >= 9);
    if (wide && flag_relu) {
        conv_depthwise_5x5s2p2_relu(din, dout, num, chout, hout, wout, \
            chin, hin, win, weights, bias, flag_bias, flag_relu, ctx);
    } else if (wide) {
        conv_depthwise_5x5s2p2(din, dout, num, chout, hout, wout, \
            chin, hin, win, weights, bias, flag_bias, flag_relu, ctx);
    } else if (flag_relu) {
        conv_depthwise_5x5s2p2_relu_s(din, dout, num, chout, hout, wout, \
            chin, hin, win, weights, bias, flag_bias, flag_relu, ctx);
    } else {
        conv_depthwise_5x5s2p2_s(din, dout, num, chout, hout, wout, \
            chin, hin, win, weights, bias, flag_bias, flag_relu, ctx);
    }
}

#ifdef __aarch64__

//! larger depthwise, win >= 9;
//! Depthwise 5x5 convolution, stride 2, padding 2, AArch64 NEON kernel.
//! Requires w_in >= 9 (narrower inputs must use the *_s variant).
//! Layout is NCHW; each channel is convolved with its own 5x5 filter.
//! The asm body computes two output rows (4 pixels wide each) per
//! iteration of the height loop; the tail columns are written through
//! out_buf0/out_buf1 and copied out with a scalar loop.
//! @param din        input activations, [num, ch_in, h_in, w_in]
//! @param dout       output activations, [num, ch_out, h_out, w_out]
//! @param weights    per-channel 5x5 filters, 25 floats per channel
//! @param bias       per-channel bias, read only when flag_bias is set
//! @param flag_relu  unused in this kernel (ReLU variant is separate)
//! @param ctx        supplies scratch workspace via get_work_space();
//!                   assumes it holds at least 2 * w_in floats
//!                   (zero row + write-off row) — TODO confirm with callers
void conv_depthwise_5x5s2p2(const float* din, float* dout, \
                  int num, int ch_out, int h_out, int w_out, \
                  int ch_in, int h_in, int w_in, \
                  const float* weights, const float* bias, \
                  bool flag_bias, bool flag_relu, Context<ARM>* ctx){

    CHECK_GE(w_in, 9) <<  "only support win >= 9";
    //! round output width up to a multiple of 4: the asm produces 4
    //! output pixels per row per iteration
    int w_out_round = (w_out + 3) / 4 * 4;
    int cnt = (w_out_round - 4) / 4;
    //! mid_cnt iterations run the unmasked main loop; the last 4-wide
    //! column group is handled by the masked tail section (label 2:)
    int mid_cnt = cnt - 1;
    //! first input column consumed by the tail group (stride 2, pad 2)
    int right_start = cnt * 2 * 4 - 2;
    //! number of lanes in the 12-lane tail window that fall past the
    //! right edge of the input; those lanes are zeroed via bif
    int mask_cnt = 12 - (w_in - right_start);
    int mask[12] ;
    memset(mask, 0xff, 12 * sizeof(int));
    for (int i = 0; i < mask_cnt; ++i){
        mask[11 - i] = 0;
    }
    //! workspace: first w_in floats are a zero row standing in for
    //! rows above/below the image; the next w_in floats are a scratch
    //! row that absorbs stores for output rows past h_out
    float* zero_ptr = static_cast<float*>(ctx->get_work_space());
    memset(zero_ptr, 0, w_in * sizeof(float));
    float* write_ptr = zero_ptr + w_in;
    int in_spatial_size = w_in * h_in;
    int out_spatial_size = w_out * h_out;
    int weights_saptial_size = 25;

    for (int n = 0; n < num; ++n){
        const float* din_batch = din + n * in_spatial_size * ch_in;
        float* dout_batch = dout + n * out_spatial_size * ch_out;
#pragma omp parallel for
        for (int c = 0; c < ch_in; ++c){
            const float* din_ch = din_batch + c * in_spatial_size;
            float* dout_ch = dout_batch + c * out_spatial_size;
            //! 7 input rows feed 2 output rows; the first two point at
            //! the zero row because of the top padding of 2
            const float* din0 = zero_ptr;
            const float* din1 = zero_ptr;
            const float* din2 = din_ch;
            const float* din3 = din2 + w_in;
            const float* din4 = din3 + w_in;
            const float* din5 = din4 + w_in;
            const float* din6 = din5 + w_in;

            //! tail buffers: the asm always writes 4 pixels; the valid
            //! remainder is copied to dout after the asm block
            float out_buf0[4];
            float out_buf1[4];
            float* dout0 = dout_ch;
            float* dout1 = dout0 + w_out;

            const float* weights_c = weights+ c * weights_saptial_size;
            for (int h = 0; h < h_out; h += 2){
                //! (h * 2 - 2) + 6 > h_in - 1
                //! bottom-edge handling: rows past the input are
                //! redirected to the zero row. The switch relies on
                //! intentional fallthrough: case k zeroes row (7-k)
                //! and every row below it.
                if (h * 2 + 5 > h_in){
                    switch (h * 2 + 5 - h_in){
                        case 6:
                            din1 = zero_ptr;
                        case 5:
                            din2 = zero_ptr;
                        case 4:
                            din3 = zero_ptr;
                        case 3:
                            din4 = zero_ptr;
                        case 2:
                            din5 = zero_ptr;
                        case 1:
                            din6 = zero_ptr;
                        default:
                            break;
                    }
                }
                //! if the second output row falls past h_out, write it
                //! to the scratch row instead of real output
                if (h + 2 > h_out){
                    switch (h + 2 - h_out){
                        case 1:
                            dout1 = write_ptr;
                        default:
                            break;
                    }
                }
                const float* din_ptr0 = din0;
                const float* din_ptr1 = din1;
                const float* din_ptr2 = din2;
                const float* din_ptr3 = din3;
                const float* din_ptr4 = din4;
                const float* din_ptr5 = din5;
                const float* din_ptr6 = din6;

                const float* weights_ptr = weights_c;
                float* dout_ptr0 = dout0;
                float* dout_ptr1 = dout1;

                float bias_c = 0.f;
                if (flag_bias){
                    bias_c = bias[c];
                }
                float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
                int* mask_ptr = mask;
                int loop = mid_cnt;
                //! post-increment strides handed to ld2/st1 as registers
                const int s_8 = 8;
                const int s_16 = 16;

                //! Register map (ld2 de-interleaves even/odd columns):
                //! in r0, r1/r4, r2/r5, r3/r6: x 0 2 4 -- v8   v13  v18  v23
                //! in r0, r1/r4, r2/r5, r3/r6: x 1 3 5 -- v9   v14  v19  v24
                //! in r0, r1/r4, r2/r5, r3/r6: 0 2 4 6 -- v6   v11  v16  v21
                //! in r0, r1/r4, r2/r5, r3/r6: 1 3 5 7 -- v7   v12  v17  v22
                //! in r0, r1/r4, r2/r5, r3/r6: 2 4 6 8 -- v10  v15  v20  v25
                //! out r0, r1 -- v26, v27
                //! Structure: left-edge section (pad columns come from
                //! v31 == 0 via ext), main loop at label 1:, masked
                //! right-edge tail at label 2: storing into out_buf0/1.
                asm volatile(
                    "movi   v31.4s, #0x0\n"
                    "prfm pldl1keep, [%[din_ptr0]]  \n"
                    "prfm pldl1keep, [%[din_ptr1]]  \n"
                    "prfm pldl1keep, [%[din_ptr2]]  \n"
                    "prfm pldl1keep, [%[din_ptr3]]  \n"
                    "prfm pldl1keep, [%[din_ptr4]]  \n"
                    "prfm pldl1keep, [%[din_ptr5]]  \n"
                    "prfm pldl1keep, [%[din_ptr6]]  \n"
                    "prfm pldl1keep, [%[weights]]   \n"
                    "prfm pldl1keep, [%[mask]]      \n"
                    // left
                    "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32             \n"  // r0 v6: 0 2 4 6, v7: 1 3 5 7
                    "ext v8.16b, v31.16b, v6.16b, #12                   \n"  // r0 v8: x 0 2 4
                    "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32           \n"  // r1 v11: 0 2 4 6, v12: 1 3 5 7
                    "ext v9.16b, v31.16b, v7.16b, #12                   \n"  // r0 v9: x 1 3 5
                    "ld1 {v0.4s, v1.4s}, [%[weights]], #32              \n"  // load weights 0-7
                    "ext v10.16b, v6.16b, v31.16b, #4                   \n"
                    "ld1 {v10.s}[3], [%[din_ptr0]]                      \n"  // r0 v10: 2 4 6 8
                    "sub %[din_ptr0], %[din_ptr0], #8                   \n"
                    "ext v13.16b, v31.16b, v11.16b, #12                 \n"  // r1 v13: x 0 2 4
                    "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32           \n"  // r2 v16: 0 2 4 6, v17: 1 3 5 7
                    "ext v14.16b, v31.16b, v12.16b, #12                 \n"  // r1 v14: x 1 3 5
                    "ld1 {v2.4s, v3.4s}, [%[weights]], #32              \n"  //load weights 8-15
                    "ext v15.16b, v11.16b, v31.16b, #4                  \n"
                    "ld1 {v15.s}[3], [%[din_ptr1]]                      \n"  // r1 v15: 2 4 6
                    "sub %[din_ptr1], %[din_ptr1], #8                   \n"
                    "ext v18.16b, v31.16b, v16.16b, #12                 \n"  // r2 v18: x 0 2 4
                    "ld1 {v4.4s, v5.4s}, [%[weights]], #32              \n"  // load weights 16-23
                    "ext v19.16b, v31.16b, v17.16b, #12                 \n"  // r2 v19: x 1 3 5
                    "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32           \n"  // r3 v21: 0 2 4 6, v22: 1 3 5 7
                    "ext v20.16b, v16.16b, v31.16b, #4                  \n"
                    "ld1 {v20.s}[3], [%[din_ptr2]]                      \n"  // r2 v20: 2 4 6 8
                    "sub %[din_ptr2], %[din_ptr2], #8                   \n"
                    "ext v23.16b, v31.16b, v21.16b, #12                 \n"  // r3 v23: x 0 2 4
                    "ld1 {v30.4s}, [%[weights]]                         \n"  // load weights 24
                    "ext v24.16b, v31.16b, v22.16b, #12                 \n"  // r3 v24: x 1 3 5
                    "ld1 {v26.4s}, [%[vbias]]                           \n"  // load bias to out_r0
                    "ext v25.16b, v21.16b, v31.16b, #4                  \n"
                    "ld1 {v25.s}[3], [%[din_ptr3]]                      \n"  // r2 v25: 2 4 6 8
                    "sub %[din_ptr3], %[din_ptr3], #8                   \n"
                    "mov v27.16b, v26.16b                               \n"  // load bias to out_r1
                    "mov v28.16b, v31.16b                               \n"  //load zero to out_r0
                    "mov v29.16b, v31.16b                               \n"  //load zero to out_r1

                    "fmla v26.4s, v8.4s, v0.s[0]                        \n"  // out r0: w0
                    "fmla v28.4s, v9.4s, v0.s[1]                        \n"  // out r0: w1
                    "fmla v26.4s, v6.4s, v0.s[2]                        \n"  // out r0: w2
                    "fmla v28.4s, v7.4s, v0.s[3]                        \n"  // out r0: w3

                    "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8]          \n"  // next r0 v8: 0 2 4 6, v9: 1 3 5 7

                    "fmla v26.4s, v10.4s, v1.s[0]                       \n"  // out r0: w4
                    "fmla v28.4s, v13.4s, v1.s[1]                       \n"  // out r0: w5
                    "fmla v26.4s, v14.4s, v1.s[2]                       \n"  // out r0: w6
                    "fmla v28.4s, v11.4s, v1.s[3]                       \n"  // out r0: w7

                    "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8]          \n"  // next r0 v6: 2 4 6 8, v7: 3 5 7 9

                    "fmla v26.4s, v12.4s, v2.s[0]                       \n"  // out r0: w8
                    "fmla v28.4s, v15.4s, v2.s[1]                       \n"  // out r0: w9
                    "fmla v26.4s, v18.4s, v2.s[2]                       \n"  // out r0: w10
                    "fmla v28.4s, v19.4s, v2.s[3]                       \n"  // out r0: w11

                    "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16]       \n"  // next r0 v10: 4 6 8 10, v11: trash register

                    "fmla v26.4s, v16.4s, v3.s[0]                       \n"  // out r0: w12
                    "fmla v28.4s, v17.4s, v3.s[1]                       \n"  // out r0: w13
                    "fmla v26.4s, v20.4s, v3.s[2]                       \n"  // out r0: w14
                    "fmla v28.4s, v23.4s, v3.s[3]                       \n"  // out r0: w15
                    "prfm pldl1keep, [%[din_ptr0]]                      \n"

                    "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], #32           \n"  // r4 v11: 0 2 4 6, v12: 1 3 5 7

                    "fmla v26.4s, v24.4s, v4.s[0]                       \n"  // out r0: w16
                    "fmla v28.4s, v21.4s, v4.s[1]                       \n"  // out r0: w17

                    "ext v13.16b, v31.16b, v11.16b, #12                 \n"  // r4 v13: x 0 2 4
                    "ext v14.16b, v31.16b, v12.16b, #12                 \n"  // r4 v14: x 1 3 5
                    "ext v15.16b, v11.16b, v31.16b, #4                  \n"

                    "fmla v26.4s, v22.4s, v4.s[2]                       \n"  // out r0: w18
                    "fmla v28.4s, v25.4s, v4.s[3]                       \n"  // out r0: w19

                    "ld1 {v15.s}[3], [%[din_ptr4]]                      \n"  // r4 v15: 2 4 6

                    "fmla v27.4s, v18.4s, v0.s[0]                       \n"  // out r1: w0
                    "fmla v29.4s, v19.4s, v0.s[1]                       \n"  // out r1: w1

                    "sub %[din_ptr4], %[din_ptr4], #8                   \n"

                    "fmla v27.4s, v16.4s, v0.s[2]                       \n"  // out r1: w2
                    "fmla v29.4s, v17.4s, v0.s[3]                       \n"  // out r1: w3
                    "fmla v27.4s, v20.4s, v1.s[0]                       \n"  // out r1: w4
                    "fmla v29.4s, v23.4s, v1.s[1]                       \n"  // out r1: w5

                    "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], #32           \n"  // r5 v16: 0 2 4 6, v17: 1 3 5 7

                    "fmla v27.4s, v24.4s, v1.s[2]                       \n"  // out r1: w6
                    "fmla v29.4s, v21.4s, v1.s[3]                       \n"  // out r1: w7

                    "ext v18.16b, v31.16b, v16.16b, #12                 \n"  // r5 v18: x 0 2 4
                    "ext v19.16b, v31.16b, v17.16b, #12                 \n"  // r5 v19: x 1 3 5
                    "ext v20.16b, v16.16b, v31.16b, #4                  \n"

                    "fmla v27.4s, v22.4s, v2.s[0]                       \n"  // out r1: w8
                    "fmla v29.4s, v25.4s, v2.s[1]                       \n"  // out r1: w9

                    "ld1 {v20.s}[3], [%[din_ptr5]]                      \n"  // r5 v20: 2 4 6
                    "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], #32           \n"  // r6 v21: 0 2 4 6, v22: 1 3 5 7

                    "ext v23.16b, v31.16b, v21.16b, #12                 \n"  // r6 v23: x 0 2 4
                    "ext v24.16b, v31.16b, v22.16b, #12                 \n"  // r6 v24: x 1 3 5
                    "ext v25.16b, v21.16b, v31.16b, #4                  \n"
                    "sub %[din_ptr5], %[din_ptr5], #8                   \n"


                    "fmla v26.4s, v11.4s, v5.s[2]                       \n"  // out r0: w22
                    "fmla v28.4s, v12.4s, v5.s[3]                       \n"  // out r0: w23

                    "ld1 {v25.s}[3], [%[din_ptr6]]                      \n"  // r6 v25: 2 4 6

                    "fmla v26.4s, v13.4s, v5.s[0]                       \n"  // out r0: w20
                    "fmla v28.4s, v14.4s, v5.s[1]                       \n"  // out r0: w21

                    "sub %[din_ptr6], %[din_ptr6], #8                   \n"

                    "fmla v26.4s, v15.4s, v30.s[0]                      \n"  // out r0: w24
                    "fmla v27.4s, v13.4s, v2.s[2]                       \n"  // out r1: w10

                    "fadd v26.4s, v26.4s, v28.4s                        \n"
                    "fmla v29.4s, v14.4s, v2.s[3]                       \n"  // out r1: w11

                    "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8]        \n"  // next r1 v13: 0 2 4 6, v14: 1 3 5 7
                    "fmla v27.4s, v11.4s, v3.s[0]                       \n"  // out r1: w12
                    "fmla v29.4s, v12.4s, v3.s[1]                       \n"  // out r1: w13

                    "st1 {v26.4s}, [%[dout_ptr0]], %[s_16]              \n"  // store output r0
                    "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8]        \n"  // next r1 v11: 2 4 6 8, v12: 3 5 7 9

                    "fmla v27.4s, v15.4s, v3.s[2]                       \n"  // out r1: w14
                    "fmla v29.4s, v16.4s, v4.s[1]                       \n"  // out r1: w17
                    "fmla v27.4s, v18.4s, v3.s[3]                       \n"  // out r1: w15
                    "fmla v29.4s, v19.4s, v4.s[0]                       \n"  // out r1: w16

                    "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16]       \n"  // next r1 v15: 4 6 8 10, v16: trash register

                    "fmla v27.4s, v17.4s, v4.s[2]                       \n"  // out r1: w18
                    "fmla v29.4s, v20.4s, v4.s[3]                       \n"  // out r1: w19

                    "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8]        \n"  // next r2 v18: 0 2 4 6, v19: 1 3 5 7
                    "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8]        \n"  // next r2 v16: 2 4 6 8, v11: 3 5 7 9

                    "fmla v27.4s, v23.4s, v5.s[0]                       \n"  // out r1: w20
                    "fmla v29.4s, v21.4s, v5.s[2]                       \n"  // out r1: w22
                    "fmla v27.4s, v24.4s, v5.s[1]                       \n"  // out r1: w21
                    "fmla v29.4s, v22.4s, v5.s[3]                       \n"  // out r1: w23

                    "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16]       \n"  // next r2 v20: 4 6 8 10, v21: trash register
                    "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8]        \n"  // next r3 v23: 0 2 4 6, v24: 1 3 5 7

                    "fmla v27.4s, v25.4s, v30.s[0]                      \n"  // out r1: w24

                    "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8]        \n"  // next r3 v21: 2 4 6 8, v22: 3 5 7 9
                    "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16]       \n"  // next r3 v25: 4 6 8 10, v26: trash register

                    "fadd v27.4s, v27.4s, v29.4s                        \n"
                    "cmp %w[mid_cnt], #1                                \n"

                    "prfm pldl1keep, [%[din_ptr1]]                      \n"
                    "prfm pldl1keep, [%[din_ptr2]]                      \n"
                    "prfm pldl1keep, [%[din_ptr3]]                      \n"

                    "st1 {v27.4s}, [%[dout_ptr1]], #16                  \n"
                    "blt 2f                                             \n"

                    // mid loop
                    "1:                                                 \n"
                    "ld1 {v26.4s}, [%[vbias]]                           \n"
                    "mov v27.16b, v26.16b                               \n"
                    "mov v28.16b, v31.16b                               \n"
                    "mov v29.16b, v31.16b                               \n"

                    // out_r0 r0-r3
                    "fmla v26.4s, v8.4s, v0.s[0]                        \n"
                    "fmla v28.4s, v9.4s, v0.s[1]                        \n"
                    "fmla v26.4s, v6.4s, v0.s[2]                        \n"
                    "fmla v28.4s, v7.4s, v0.s[3]                        \n"

                    "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8]          \n"

                    "fmla v26.4s, v10.4s, v1.s[0]                       \n"
                    "fmla v28.4s, v11.4s, v1.s[3]                       \n"

                    "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8]          \n"

                    "fmla v26.4s, v14.4s, v1.s[2]                       \n"
                    "fmla v28.4s, v13.4s, v1.s[1]                       \n"

                    "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16]       \n"
                    "prfm pldl1keep, [%[din_ptr0]]                      \n"

                    "fmla v26.4s, v12.4s, v2.s[0]                       \n"
                    "fmla v28.4s, v15.4s, v2.s[1]                       \n"

                    "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8]        \n"

                    "fmla v26.4s, v16.4s, v3.s[0]                       \n"
                    "fmla v27.4s, v16.4s, v0.s[2]                       \n"

                    "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8]        \n"

                    "fmla v28.4s, v19.4s, v2.s[3]                       \n"
                    "fmla v29.4s, v19.4s, v0.s[1]                       \n"

                    "ld2 {v15.4s, v16.4s}, [%[din_ptr4]], %[s_16]       \n"
                    "prfm pldl1keep, [%[din_ptr4]]                      \n"

                    "fmla v26.4s, v18.4s, v2.s[2]                       \n"
                    "fmla v27.4s, v18.4s, v0.s[0]                       \n"

                    "fmla v28.4s, v17.4s, v3.s[1]                       \n"
                    "fmla v29.4s, v17.4s, v0.s[3]                       \n"

                    "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8]        \n"

                    "fmla v26.4s, v20.4s, v3.s[2]                       \n"
                    "fmla v27.4s, v20.4s, v1.s[0]                       \n"

                    "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8]        \n"

                    "fmla v29.4s, v21.4s, v1.s[3]                       \n"
                    "fmla v28.4s, v21.4s, v4.s[1]                       \n"
                    "fmla v28.4s, v23.4s, v3.s[3]                       \n"
                    "fmla v29.4s, v23.4s, v1.s[1]                       \n"

                    "ld2 {v20.4s, v21.4s}, [%[din_ptr5]], %[s_16]       \n"
                    "prfm pldl1keep, [%[din_ptr5]]                      \n"

                    "fmla v26.4s, v24.4s, v4.s[0]                       \n"
                    "fmla v27.4s, v24.4s, v1.s[2]                       \n"

                    "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8]        \n"

                    "fmla v27.4s, v22.4s, v2.s[0]                       \n"
                    "fmla v26.4s, v22.4s, v4.s[2]                       \n"

                    "fmla v28.4s, v25.4s, v4.s[3]                       \n"
                    "fmla v29.4s, v25.4s, v2.s[1]                       \n"

                    "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8]        \n"
                    "fadd v28.4s, v26.4s, v28.4s                        \n"

                    "ld2 {v25.4s, v26.4s}, [%[din_ptr6]], %[s_16]       \n"
                    "mov v26.16b, v31.16b                               \n"
                    "prfm pldl1keep, [%[din_ptr6]]                      \n"

                    "fmla v26.4s, v13.4s, v5.s[0]                       \n"
                    "fmla v28.4s, v14.4s, v5.s[1]                       \n"
                    "fmla v27.4s, v13.4s, v2.s[2]                       \n"
                    "fmla v29.4s, v14.4s, v2.s[3]                       \n"

                    "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8]        \n"

                    "fmla v26.4s, v11.4s, v5.s[2]                       \n"
                    "fmla v28.4s, v12.4s, v5.s[3]                       \n"
                    "fmla v27.4s, v11.4s, v3.s[0]                       \n"
                    "fmla v29.4s, v12.4s, v3.s[1]                       \n"

                    "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8]        \n"

                    "fmla v26.4s, v15.4s, v30.s[0]                      \n"
                    "fmla v27.4s, v15.4s, v3.s[2]                       \n"
                    "fmla v29.4s, v16.4s, v4.s[1]                       \n"
                    "fmla v27.4s, v17.4s, v4.s[2]                       \n"

                    "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16]       \n"
                    "prfm pldl1keep, [%[din_ptr1]]                      \n"

                    "fmla v29.4s, v18.4s, v3.s[3]                       \n"
                    "fmla v27.4s, v19.4s, v4.s[0]                       \n"

                    "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8]        \n"

                    "fmla v29.4s, v20.4s, v4.s[3]                       \n"

                    "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8]        \n"

                    "fmla v27.4s, v23.4s, v5.s[0]                       \n"
                    "fmla v27.4s, v21.4s, v5.s[2]                       \n"

                    "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16]       \n"

                    "fmla v29.4s, v24.4s, v5.s[1]                       \n"

                    "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8]        \n"
                    "prfm pldl1keep, [%[din_ptr2]]                      \n"

                    "fmla v29.4s, v22.4s, v5.s[3]                       \n"

                    "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8]        \n"

                    "fmla v27.4s, v25.4s, v30.s[0]                      \n"

                    "fadd v26.4s, v26.4s, v28.4s                        \n"

                    "prfm pldl1keep, [%[din_ptr3]]                      \n"

                    "fadd v27.4s, v27.4s, v29.4s                        \n"

                    "st1 {v26.4s}, [%[dout_ptr0]], #16                  \n"
                    "st1 {v27.4s}, [%[dout_ptr1]], #16                  \n"

                    "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16]       \n"
                    "subs %w[mid_cnt], %w[mid_cnt], #1                  \n"
                    "bne 1b                                             \n"

                    "2:                                                 \n"
                    "ld2 {v26.4s, v27.4s}, [%[mask]], %[s_8]            \n"
                    "ld2 {v28.4s, v29.4s}, [%[mask]], %[s_8]            \n"
                    "bif v8.16b, v31.16b, v26.16b                       \n"
                    "bif v9.16b, v31.16b, v27.16b                       \n"
                    "bif v6.16b, v31.16b, v28.16b                       \n"
                    "bif v7.16b, v31.16b, v29.16b                       \n"

                    "bif v13.16b, v31.16b, v26.16b                      \n"
                    "bif v14.16b, v31.16b, v27.16b                      \n"
                    "bif v11.16b, v31.16b, v28.16b                      \n"
                    "bif v12.16b, v31.16b, v29.16b                      \n"

                    "bif v18.16b, v31.16b, v26.16b                      \n"
                    "bif v19.16b, v31.16b, v27.16b                      \n"
                    "bif v16.16b, v31.16b, v28.16b                      \n"
                    "bif v17.16b, v31.16b, v29.16b                      \n"

                    "bif v23.16b, v31.16b, v26.16b                      \n"
                    "bif v24.16b, v31.16b, v27.16b                      \n"
                    "bif v21.16b, v31.16b, v28.16b                      \n"
                    "bif v22.16b, v31.16b, v29.16b                      \n"

                    "ld2 {v28.4s, v29.4s}, [%[mask]]                    \n"
                    "ld1 {v26.4s}, [%[vbias]]                           \n"
                    "mov v29.16b, v31.16b                               \n"

                    "bif v10.16b, v31.16b, v28.16b                      \n"
                    "bif v15.16b, v31.16b, v28.16b                      \n"

                    "mov v27.16b, v26.16b                               \n"

                    "bif v20.16b, v31.16b, v28.16b                      \n"
                    "bif v25.16b, v31.16b, v28.16b                      \n"
                    "mov v28.16b, v31.16b                               \n"

                    "fmla v26.4s, v8.4s, v0.s[0]                        \n"
                    "fmla v28.4s, v9.4s, v0.s[1]                        \n"
                    "fmla v26.4s, v6.4s, v0.s[2]                        \n"
                    "fmla v28.4s, v7.4s, v0.s[3]                        \n"

                    "fmla v26.4s, v10.4s, v1.s[0]                       \n"
                    "fmla v28.4s, v13.4s, v1.s[1]                       \n"
                    "fmla v26.4s, v14.4s, v1.s[2]                       \n"
                    "fmla v28.4s, v11.4s, v1.s[3]                       \n"

                    "sub %[mask], %[mask], #16                          \n"
                    "ld2 {v6.4s, v7.4s}, [%[mask]], %[s_8]              \n"
                    "ld2 {v8.4s, v9.4s}, [%[mask]], %[s_8]              \n"
                    "ld2 {v10.4s, v11.4s}, [%[mask]]                    \n"

                    "fmla v26.4s, v12.4s, v2.s[0]                       \n"
                    "fmla v28.4s, v15.4s, v2.s[1]                       \n"

                    "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8]        \n"

                    "fmla v26.4s, v16.4s, v3.s[0]                       \n"
                    "fmla v28.4s, v17.4s, v3.s[1]                       \n"

                    "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8]        \n"

                    "fmla v27.4s, v16.4s, v0.s[2]                       \n"
                    "fmla v29.4s, v17.4s, v0.s[3]                       \n"

                    "ld2 {v15.4s, v16.4s}, [%[din_ptr4]]                \n"

                    "fmla v26.4s, v18.4s, v2.s[2]                       \n"
                    "fmla v28.4s, v19.4s, v2.s[3]                       \n"
                    "fmla v27.4s, v18.4s, v0.s[0]                       \n"
                    "fmla v29.4s, v19.4s, v0.s[1]                       \n"

                    "bif  v13.16b, v31.16b, v6.16b                      \n"
                    "bif  v14.16b, v31.16b, v7.16b                      \n"
                    "bif  v11.16b, v31.16b, v8.16b                      \n"
                    "bif  v12.16b, v31.16b, v9.16b                      \n"
                    "bif  v15.16b, v31.16b, v10.16b                     \n"

                    "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8]        \n"

                    "fmla v26.4s, v20.4s, v3.s[2]                       \n"
                    "fmla v27.4s, v20.4s, v1.s[0]                       \n"

                    "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8]        \n"

                    "fmla v29.4s, v21.4s, v1.s[3]                       \n"
                    "fmla v28.4s, v21.4s, v4.s[1]                       \n"

                    "ld2 {v20.4s, v21.4s}, [%[din_ptr5]]                \n"

                    "fmla v28.4s, v23.4s, v3.s[3]                       \n"
                    "fmla v29.4s, v23.4s, v1.s[1]                       \n"
                    "fmla v27.4s, v24.4s, v1.s[2]                       \n"
                    "fmla v26.4s, v24.4s, v4.s[0]                       \n"

                    "bif  v18.16b, v31.16b, v6.16b                      \n"
                    "bif  v19.16b, v31.16b, v7.16b                      \n"
                    "bif  v16.16b, v31.16b, v8.16b                      \n"
                    "bif  v17.16b, v31.16b, v9.16b                      \n"
                    "bif  v20.16b, v31.16b, v10.16b                     \n"

                    "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8]        \n"

                    "fmla v27.4s, v22.4s, v2.s[0]                       \n"
                    "fmla v26.4s, v22.4s, v4.s[2]                       \n"

                    "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8]        \n"

                    "fmla v28.4s, v25.4s, v4.s[3]                       \n"
                    "fmla v29.4s, v25.4s, v2.s[1]                       \n"
                    "fadd v28.4s, v28.4s, v26.4s                        \n"

                    "ld2 {v25.4s, v26.4s}, [%[din_ptr6]]                \n"
                    "mov v26.16b, v31.16b                               \n"

                    "bif  v23.16b, v31.16b, v6.16b                      \n"
                    "bif  v24.16b, v31.16b, v7.16b                      \n"
                    "bif  v21.16b, v31.16b, v8.16b                      \n"
                    "bif  v22.16b, v31.16b, v9.16b                      \n"
                    "bif  v25.16b, v31.16b, v10.16b                     \n"

                    "fmla v26.4s, v13.4s, v5.s[0]                       \n"
                    "fmla v28.4s, v14.4s, v5.s[1]                       \n"
                    "fmla v26.4s, v11.4s, v5.s[2]                       \n"
                    "fmla v28.4s, v12.4s, v5.s[3]                       \n"
                    "fmla v26.4s, v15.4s, v30.s[0]                      \n"

                    "fmla v27.4s, v13.4s, v2.s[2]                       \n"
                    "fmla v29.4s, v14.4s, v2.s[3]                       \n"
                    "fmla v27.4s, v11.4s, v3.s[0]                       \n"
                    "fmla v29.4s, v12.4s, v3.s[1]                       \n"

                    "fadd v26.4s, v26.4s, v28.4s                        \n"
                    "fmla v27.4s, v15.4s, v3.s[2]                       \n"
                    "fmla v29.4s, v18.4s, v3.s[3]                       \n"
                    "fmla v27.4s, v19.4s, v4.s[0]                       \n"
                    "fmla v29.4s, v16.4s, v4.s[1]                       \n"

                    "st1 {v26.4s}, [%[out_buf0]]                        \n"
                    "fmla v27.4s, v17.4s, v4.s[2]                       \n"
                    "fmla v29.4s, v20.4s, v4.s[3]                       \n"
                    "fmla v27.4s, v23.4s, v5.s[0]                       \n"
                    "fmla v29.4s, v24.4s, v5.s[1]                       \n"

                    "fmla v27.4s, v21.4s, v5.s[2]                       \n"
                    "fmla v29.4s, v22.4s, v5.s[3]                       \n"
                    "fmla v27.4s, v25.4s, v30.s[0]                      \n"
                    "fadd v27.4s, v27.4s, v29.4s                        \n"

                    "st1 {v27.4s}, [%[out_buf1]]                        \n"

                : [dout_ptr0] "+r" (dout_ptr0), [dout_ptr1] "+r" (dout_ptr1), [mid_cnt] "+r" (loop),  \
                  [din_ptr0] "+r" (din_ptr0), [din_ptr1] "+r" (din_ptr1), [din_ptr2] "+r" (din_ptr2), [din_ptr3] "+r" (din_ptr3), [din_ptr4] "+r" (din_ptr4), \
                  [din_ptr5] "+r" (din_ptr5), [din_ptr6] "+r" (din_ptr6), [mask] "+r" (mask_ptr), [weights] "+r" (weights_ptr)
                : [vbias] "r" (vbias), [out_buf0] "r" (out_buf0), [out_buf1] "r" (out_buf1), [s_8] "r" (s_8), \
                  [s_16] "r" (s_16)
                : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", \
                  "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
                );

                //! copy the valid tail pixels (at most 4) from the
                //! buffered last column group into the real output rows
                int remain_cnt = w_out - (mid_cnt + 1) * 4;
                for (int i = 0; i < remain_cnt; ++i){
                    dout_ptr0[i] = out_buf0[i];
                    dout_ptr1[i] = out_buf1[i];
                }
                //! advance the 7-row input window by 4 input rows
                //! (2 output rows x stride 2) and the output by 2 rows
                din0 = din4;
                din1 = din5;
                din2 = din6;
                din3 = din6 + w_in;
                din4 = din3 + w_in;
                din5 = din4 + w_in;
                din6 = din5 + w_in;
                dout0 = dout1 + w_out;
                dout1 = dout0 + w_out;
            }
        }
    }
}


//! larger depthwise, win >= 9;
// Depthwise 5x5 convolution, stride 2, padding 2, with fused ReLU
// (AArch64 NEON inline assembly), for input widths >= 9.
//
// Layouts (as addressed below): din is [num, ch_in, h_in, w_in],
// dout is [num, ch_out, h_out, w_out]; weights hold 25 floats per channel.
// bias is read only when flag_bias is true. flag_relu is ignored in this
// overload: the ReLU is hard-coded via the "fmax ..., v31" (zero) instructions.
// ctx supplies scratch space: one zeroed row (virtual padding) plus one spill
// row for a possibly-absent second output row.
//
// Each pass of the inner asm produces 4 output columns of two consecutive
// output rows (dout0/dout1) from seven input rows (din0..din6), in three
// phases: a "left" block that synthesizes the 2-column left padding with ext,
// a mid loop over full 4-wide blocks, and a masked "right" block whose result
// goes to out_buf0/out_buf1 and is copied out lane-by-lane in C++.
void conv_depthwise_5x5s2p2_relu(const float* din, float* dout, \
          int num, int ch_out, int h_out, int w_out, \
          int ch_in, int h_in, int w_in, \
          const float* weights, const float* bias, \
          bool flag_bias, bool flag_relu, Context<ARM>* ctx){

    CHECK_GE(w_in, 9) << "only support win >= 9";
    int w_out_round = (w_out + 3) / 4 * 4;  // output width rounded up to the 4-lane vector width
    int cnt = (w_out_round - 4) / 4;        // 4-wide output blocks after the first (left) block
    int mid_cnt = cnt - 1;                  // full mid-loop iterations; the last block is masked
    int right_start = cnt * 2 * 4 - 2;      // input column where the masked right block starts (stride 2, pad 2)
    int mask_cnt = 12 - (w_in - right_start);  // mask lanes that fall past the end of the input row
    // bif-select mask: 0xffffffff keeps the loaded lane, 0 replaces it with zero.
    int mask[12] ;
    memset(mask, 0xff, 12 * sizeof(int));
    for (int i = 0; i < mask_cnt; ++i){
        mask[11 - i] = 0;
    }
    // Scratch layout: [0, w_in) is a zeroed row used as virtual top/bottom
    // padding; [w_in, 2*w_in) is a throw-away output row (see write_ptr).
    float* zero_ptr = static_cast<float*>(ctx->get_work_space());
    memset(zero_ptr, 0, w_in * sizeof(float));
    float* write_ptr = zero_ptr + w_in;
    int in_spatial_size = w_in * h_in;
    int out_spatial_size = w_out * h_out;
    int weights_saptial_size = 25;  // floats per 5x5 depthwise kernel (note: "saptial" typo predates this review)

    for (int n = 0; n < num; ++n){
        const float* din_batch = din + n * in_spatial_size * ch_in;
        float* dout_batch = dout + n * out_spatial_size * ch_out;

#pragma omp parallel for
        for (int c = 0; c < ch_in; ++c){
            const float* din_ch = din_batch + c * in_spatial_size;
            float* dout_ch = dout_batch + c * out_spatial_size;
            // din0/din1 start on the zero row: they are the two virtual rows
            // above the image created by pad == 2.
            const float* din0 = zero_ptr;
            const float* din1 = zero_ptr;
            const float* din2 = din_ch;
            const float* din3 = din2 + w_in;
            const float* din4 = din3 + w_in;
            const float* din5 = din4 + w_in;
            const float* din6 = din5 + w_in;

            // Landing buffers for the masked right block; valid lanes are
            // copied to dout after the asm (see remain_cnt loop below).
            float out_buf0[4];
            float out_buf1[4];
            float* dout0 = dout_ch;
            float* dout1 = dout0 + w_out;

            const float* weights_c = weights + c * weights_saptial_size;
            for (int h = 0; h < h_out; h += 2){
                //! (h * 2 - 2) + 6 > h_in - 1
                // Rows that fall below the input bottom are redirected to the
                // zero row; intentional switch fallthrough zeroes every row
                // after the first out-of-range one.
                if (h * 2 + 5 > h_in){
                    switch (h * 2 + 5 - h_in){
                        case 6:
                            din1 = zero_ptr;
                        case 5:
                            din2 = zero_ptr;
                        case 4:
                            din3 = zero_ptr;
                        case 3:
                            din4 = zero_ptr;
                        case 2:
                            din5 = zero_ptr;
                        case 1:
                            din6 = zero_ptr;
                        default:
                            break;
                    }
                }
                // Only one output row left: divert the second row's stores to
                // the scratch spill row so the asm can stay unconditional.
                if (h + 2 > h_out){
                    switch (h + 2 - h_out){
                        case 1:
                            dout1 = write_ptr;
                        default:
                            break;
                    }
                }
                const float* din_ptr0 = din0;
                const float* din_ptr1 = din1;
                const float* din_ptr2 = din2;
                const float* din_ptr3 = din3;
                const float* din_ptr4 = din4;
                const float* din_ptr5 = din5;
                const float* din_ptr6 = din6;

                const float* weights_ptr = weights_c;
                float* dout_ptr0 = dout0;
                float* dout_ptr1 = dout1;

                float bias_c = 0.f;
                if (flag_bias){
                    bias_c = bias[c];
                }
                float vbias[4] = {bias_c, bias_c, bias_c, bias_c};  // bias broadcast, loaded once per block via ld1
                int* mask_ptr = mask;
                int loop = mid_cnt;
                // Post-increment step sizes passed as registers for the
                // "[ptr], %[s_8]" / "[ptr], %[s_16]" addressing forms.
                const int s_8 = 8;
                const int s_16 = 16;

                //! in r0, r1/r4, r2/r5, r3/r6: x 0 2 4 -- v8   v13  v18  v23
                //! in r0, r1/r4, r2/r5, r3/r6: x 1 3 5 -- v9   v14  v19  v24
                //! in r0, r1/r4, r2/r5, r3/r6: 0 2 4 6 -- v6   v11  v16  v21
                //! in r0, r1/r4, r2/r5, r3/r6: 1 3 5 7 -- v7   v12  v17  v22
                //! in r0, r1/r4, r2/r5, r3/r6: 2 4 6 8 -- v10  v15  v20  v25
                //! out r0, r1 -- v26, v27
                asm volatile(
                    "movi   v31.4s, #0x0\n"
                    "prfm pldl1keep, [%[din_ptr0]]  \n"
                    "prfm pldl1keep, [%[din_ptr1]]  \n"
                    "prfm pldl1keep, [%[din_ptr2]]  \n"
                    "prfm pldl1keep, [%[din_ptr3]]  \n"
                    "prfm pldl1keep, [%[din_ptr4]]  \n"
                    "prfm pldl1keep, [%[din_ptr5]]  \n"
                    "prfm pldl1keep, [%[din_ptr6]]  \n"
                    "prfm pldl1keep, [%[weights]]   \n"
                    "prfm pldl1keep, [%[mask]]      \n"
                    // left block: 'x' lanes are the 2-column left padding,
                    // synthesized by ext against the zero register v31
                    "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32             \n"  // r0 v6: 0 2 4 6, v7: 1 3 5 7
                    "ext v8.16b, v31.16b, v6.16b, #12                   \n"  // r0 v8: x 0 2 4
                    "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32           \n"  // r1 v11: 0 2 4 6, v12: 1 3 5 7
                    "ext v9.16b, v31.16b, v7.16b, #12                   \n"  // r0 v9: x 1 3 5
                    "ld1 {v0.4s, v1.4s}, [%[weights]], #32              \n"  // load weights 0-7
                    "ext v10.16b, v6.16b, v31.16b, #4                   \n"
                    "ld1 {v10.s}[3], [%[din_ptr0]]                      \n"  // r0 v10: 2 4 6 8
                    "sub %[din_ptr0], %[din_ptr0], #8                   \n"
                    "ext v13.16b, v31.16b, v11.16b, #12                 \n"  // r1 v13: x 0 2 4
                    "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32           \n"  // r2 v16: 0 2 4 6, v17: 1 3 5 7
                    "ext v14.16b, v31.16b, v12.16b, #12                 \n"  // r1 v14: x 1 3 5
                    "ld1 {v2.4s, v3.4s}, [%[weights]], #32              \n"  //load weights 8-15
                    "ext v15.16b, v11.16b, v31.16b, #4                  \n"
                    "ld1 {v15.s}[3], [%[din_ptr1]]                      \n"  // r1 v15: 2 4 6 8
                    "sub %[din_ptr1], %[din_ptr1], #8                   \n"
                    "ext v18.16b, v31.16b, v16.16b, #12                 \n"  // r2 v18: x 0 2 4
                    "ld1 {v4.4s, v5.4s}, [%[weights]], #32              \n"  // load weights 16-23
                    "ext v19.16b, v31.16b, v17.16b, #12                 \n"  // r2 v19: x 1 3 5
                    "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32           \n"  // r3 v21: 0 2 4 6, v22: 1 3 5 7
                    "ext v20.16b, v16.16b, v31.16b, #4                  \n"
                    "ld1 {v20.s}[3], [%[din_ptr2]]                      \n"  // r2 v20: 2 4 6 8
                    "sub %[din_ptr2], %[din_ptr2], #8                   \n"
                    "ext v23.16b, v31.16b, v21.16b, #12                 \n"  // r3 v23: x 0 2 4
                    "ld1 {v30.4s}, [%[weights]]                         \n"  // load weights 24
                    "ext v24.16b, v31.16b, v22.16b, #12                 \n"  // r3 v24: x 1 3 5
                    "ld1 {v26.4s}, [%[vbias]]                           \n"  // load bias to out_r0
                    "ext v25.16b, v21.16b, v31.16b, #4                  \n"
                    "ld1 {v25.s}[3], [%[din_ptr3]]                      \n"  // r3 v25: 2 4 6 8
                    "sub %[din_ptr3], %[din_ptr3], #8                   \n"
                    "mov v27.16b, v26.16b                               \n"  // load bias to out_r1
                    "mov v28.16b, v31.16b                               \n"  //load zero to out_r0
                    "mov v29.16b, v31.16b                               \n"  //load zero to out_r1

                    "fmla v26.4s, v8.4s, v0.s[0]                        \n"  // out r0: w0
                    "fmla v28.4s, v9.4s, v0.s[1]                        \n"  // out r0: w1
                    "fmla v26.4s, v6.4s, v0.s[2]                        \n"  // out r0: w2
                    "fmla v28.4s, v7.4s, v0.s[3]                        \n"  // out r0: w3

                    "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8]          \n"  // next r0 v8: 0 2 4 6, v9: 1 3 5 7

                    "fmla v26.4s, v10.4s, v1.s[0]                       \n"  // out r0: w4
                    "fmla v28.4s, v13.4s, v1.s[1]                       \n"  // out r0: w5
                    "fmla v26.4s, v14.4s, v1.s[2]                       \n"  // out r0: w6
                    "fmla v28.4s, v11.4s, v1.s[3]                       \n"  // out r0: w7

                    "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8]          \n"  // next r0 v6: 2 4 6 8, v7: 3 5 7 9

                    "fmla v26.4s, v12.4s, v2.s[0]                       \n"  // out r0: w8
                    "fmla v28.4s, v15.4s, v2.s[1]                       \n"  // out r0: w9
                    "fmla v26.4s, v18.4s, v2.s[2]                       \n"  // out r0: w10
                    "fmla v28.4s, v19.4s, v2.s[3]                       \n"  // out r0: w11

                    "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16]       \n"  // next r0 v10: 4 6 8 10, v11: trash register

                    "fmla v26.4s, v16.4s, v3.s[0]                       \n"  // out r0: w12
                    "fmla v28.4s, v17.4s, v3.s[1]                       \n"  // out r0: w13
                    "fmla v26.4s, v20.4s, v3.s[2]                       \n"  // out r0: w14
                    "fmla v28.4s, v23.4s, v3.s[3]                       \n"  // out r0: w15
                    "prfm pldl1keep, [%[din_ptr0]]                      \n"

                    "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], #32           \n"  // r4 v11: 0 2 4 6, v12: 1 3 5 7

                    "fmla v26.4s, v24.4s, v4.s[0]                       \n"  // out r0: w16
                    "fmla v28.4s, v21.4s, v4.s[1]                       \n"  // out r0: w17

                    "ext v13.16b, v31.16b, v11.16b, #12                 \n"  // r4 v13: x 0 2 4
                    "ext v14.16b, v31.16b, v12.16b, #12                 \n"  // r4 v14: x 1 3 5
                    "ext v15.16b, v11.16b, v31.16b, #4                  \n"

                    "fmla v26.4s, v22.4s, v4.s[2]                       \n"  // out r0: w18
                    "fmla v28.4s, v25.4s, v4.s[3]                       \n"  // out r0: w19

                    "ld1 {v15.s}[3], [%[din_ptr4]]                      \n"  // r4 v15: 2 4 6 8

                    "fmla v27.4s, v18.4s, v0.s[0]                       \n"  // out r1: w0
                    "fmla v29.4s, v19.4s, v0.s[1]                       \n"  // out r1: w1

                    "sub %[din_ptr4], %[din_ptr4], #8                   \n"

                    "fmla v27.4s, v16.4s, v0.s[2]                       \n"  // out r1: w2
                    "fmla v29.4s, v17.4s, v0.s[3]                       \n"  // out r1: w3
                    "fmla v27.4s, v20.4s, v1.s[0]                       \n"  // out r1: w4
                    "fmla v29.4s, v23.4s, v1.s[1]                       \n"  // out r1: w5

                    "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], #32           \n"  // r5 v16: 0 2 4 6, v17: 1 3 5 7

                    "fmla v27.4s, v24.4s, v1.s[2]                       \n"  // out r1: w6
                    "fmla v29.4s, v21.4s, v1.s[3]                       \n"  // out r1: w7

                    "ext v18.16b, v31.16b, v16.16b, #12                 \n"  // r5 v18: x 0 2 4
                    "ext v19.16b, v31.16b, v17.16b, #12                 \n"  // r5 v19: x 1 3 5
                    "ext v20.16b, v16.16b, v31.16b, #4                  \n"

                    "fmla v27.4s, v22.4s, v2.s[0]                       \n"  // out r1: w8
                    "fmla v29.4s, v25.4s, v2.s[1]                       \n"  // out r1: w9

                    "ld1 {v20.s}[3], [%[din_ptr5]]                      \n"  // r5 v20: 2 4 6 8
                    "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], #32           \n"  // r6 v21: 0 2 4 6, v22: 1 3 5 7

                    "ext v23.16b, v31.16b, v21.16b, #12                 \n"  // r6 v23: x 0 2 4
                    "ext v24.16b, v31.16b, v22.16b, #12                 \n"  // r6 v24: x 1 3 5
                    "ext v25.16b, v21.16b, v31.16b, #4                  \n"
                    "sub %[din_ptr5], %[din_ptr5], #8                   \n"

                    "fmla v26.4s, v11.4s, v5.s[2]                       \n"  // out r0: w22
                    "fmla v28.4s, v12.4s, v5.s[3]                       \n"  // out r0: w23

                    "ld1 {v25.s}[3], [%[din_ptr6]]                      \n"  // r6 v25: 2 4 6 8

                    "fmla v26.4s, v13.4s, v5.s[0]                       \n"  // out r0: w20
                    "fmla v28.4s, v14.4s, v5.s[1]                       \n"  // out r0: w21

                    "sub %[din_ptr6], %[din_ptr6], #8                   \n"

                    "fmla v26.4s, v15.4s, v30.s[0]                      \n"  // out r0: w24
                    "fmla v27.4s, v13.4s, v2.s[2]                       \n"  // out r1: w10

                    "fadd v26.4s, v26.4s, v28.4s                        \n"
                    "fmla v29.4s, v14.4s, v2.s[3]                       \n"  // out r1: w11
                    "fmax v26.4s, v26.4s, v31.4s                        \n"  // fused ReLU on out r0

                    "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8]        \n"  // next r1 v13: 0 2 4 6, v14: 1 3 5 7
                    "fmla v27.4s, v11.4s, v3.s[0]                       \n"  // out r1: w12
                    "fmla v29.4s, v12.4s, v3.s[1]                       \n"  // out r1: w13

                    "st1 {v26.4s}, [%[dout_ptr0]], %[s_16]              \n"  // store output r0
                    "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8]        \n"  // next r1 v11: 2 4 6 8, v12: 3 5 7 9

                    "fmla v27.4s, v15.4s, v3.s[2]                       \n"  // out r1: w14
                    "fmla v29.4s, v16.4s, v4.s[1]                       \n"  // out r1: w17
                    "fmla v27.4s, v18.4s, v3.s[3]                       \n"  // out r1: w15
                    "fmla v29.4s, v19.4s, v4.s[0]                       \n"  // out r1: w16

                    "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16]       \n"  // next r1 v15: 4 6 8 10, v16: trash register

                    "fmla v27.4s, v17.4s, v4.s[2]                       \n"  // out r1: w18
                    "fmla v29.4s, v20.4s, v4.s[3]                       \n"  // out r1: w19

                    "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8]        \n"  // next r2 v18: 0 2 4 6, v19: 1 3 5 7
                    "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8]        \n"  // next r2 v16: 2 4 6 8, v17: 3 5 7 9

                    "fmla v27.4s, v23.4s, v5.s[0]                       \n"  // out r1: w20
                    "fmla v29.4s, v21.4s, v5.s[2]                       \n"  // out r1: w22
                    "fmla v27.4s, v24.4s, v5.s[1]                       \n"  // out r1: w21
                    "fmla v29.4s, v22.4s, v5.s[3]                       \n"  // out r1: w23

                    "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16]       \n"  // next r2 v20: 4 6 8 10, v21: trash register
                    "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8]        \n"  // next r3 v23: 0 2 4 6, v24: 1 3 5 7

                    "fmla v27.4s, v25.4s, v30.s[0]                      \n"  // out r1: w24

                    "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8]        \n"  // next r3 v21: 2 4 6 8, v22: 3 5 7 9
                    "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16]       \n"  // next r3 v25: 4 6 8 10, v26: trash register

                    "fadd v27.4s, v27.4s, v29.4s                        \n"
                    "fmax v27.4s, v27.4s, v31.4s                        \n"  // fused ReLU on out r1
                    "cmp %w[mid_cnt], #1                                \n"
                    "prfm pldl1keep, [%[din_ptr1]]                      \n"
                    "prfm pldl1keep, [%[din_ptr2]]                      \n"
                    "prfm pldl1keep, [%[din_ptr3]]                      \n"
                    "st1 {v27.4s}, [%[dout_ptr1]], #16                  \n"
                    "blt 2f                                             \n"

                    // mid loop
                    "1:                                                 \n"
                    "ld1 {v26.4s}, [%[vbias]]                           \n"
                    "mov v27.16b, v26.16b                               \n"
                    "mov v28.16b, v31.16b                               \n"
                    "mov v29.16b, v31.16b                               \n"

                    // out_r0 r0-r3
                    "fmla v26.4s, v8.4s, v0.s[0]                        \n"
                    "fmla v28.4s, v9.4s, v0.s[1]                        \n"
                    "fmla v26.4s, v6.4s, v0.s[2]                        \n"
                    "fmla v28.4s, v7.4s, v0.s[3]                        \n"

                    "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8]          \n"

                    "fmla v26.4s, v10.4s, v1.s[0]                       \n"
                    "fmla v28.4s, v11.4s, v1.s[3]                       \n"

                    "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8]          \n"

                    "fmla v26.4s, v14.4s, v1.s[2]                       \n"
                    "fmla v28.4s, v13.4s, v1.s[1]                       \n"

                    "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16]       \n"
                    "prfm pldl1keep, [%[din_ptr0]]                      \n"

                    "fmla v26.4s, v12.4s, v2.s[0]                       \n"
                    "fmla v28.4s, v15.4s, v2.s[1]                       \n"

                    "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8]        \n"

                    "fmla v26.4s, v16.4s, v3.s[0]                       \n"
                    "fmla v27.4s, v16.4s, v0.s[2]                       \n"

                    "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8]        \n"

                    "fmla v28.4s, v19.4s, v2.s[3]                       \n"
                    "fmla v29.4s, v19.4s, v0.s[1]                       \n"

                    "ld2 {v15.4s, v16.4s}, [%[din_ptr4]], %[s_16]       \n"
                    "prfm pldl1keep, [%[din_ptr4]]                      \n"

                    "fmla v26.4s, v18.4s, v2.s[2]                       \n"
                    "fmla v27.4s, v18.4s, v0.s[0]                       \n"

                    "fmla v28.4s, v17.4s, v3.s[1]                       \n"
                    "fmla v29.4s, v17.4s, v0.s[3]                       \n"

                    "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8]        \n"

                    "fmla v26.4s, v20.4s, v3.s[2]                       \n"
                    "fmla v27.4s, v20.4s, v1.s[0]                       \n"

                    "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8]        \n"

                    "fmla v29.4s, v21.4s, v1.s[3]                       \n"
                    "fmla v28.4s, v21.4s, v4.s[1]                       \n"
                    "fmla v28.4s, v23.4s, v3.s[3]                       \n"
                    "fmla v29.4s, v23.4s, v1.s[1]                       \n"

                    "ld2 {v20.4s, v21.4s}, [%[din_ptr5]], %[s_16]       \n"
                    "prfm pldl1keep, [%[din_ptr5]]                      \n"

                    "fmla v26.4s, v24.4s, v4.s[0]                       \n"
                    "fmla v27.4s, v24.4s, v1.s[2]                       \n"

                    "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8]        \n"

                    "fmla v27.4s, v22.4s, v2.s[0]                       \n"
                    "fmla v26.4s, v22.4s, v4.s[2]                       \n"

                    "fmla v28.4s, v25.4s, v4.s[3]                       \n"
                    "fmla v29.4s, v25.4s, v2.s[1]                       \n"

                    "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8]        \n"
                    "fadd v28.4s, v26.4s, v28.4s                        \n"

                    "ld2 {v25.4s, v26.4s}, [%[din_ptr6]], %[s_16]       \n"
                    "mov v26.16b, v31.16b                               \n"
                    "prfm pldl1keep, [%[din_ptr6]]                      \n"

                    "fmla v26.4s, v13.4s, v5.s[0]                       \n"
                    "fmla v28.4s, v14.4s, v5.s[1]                       \n"
                    "fmla v27.4s, v13.4s, v2.s[2]                       \n"
                    "fmla v29.4s, v14.4s, v2.s[3]                       \n"

                    "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8]        \n"

                    "fmla v26.4s, v11.4s, v5.s[2]                       \n"
                    "fmla v28.4s, v12.4s, v5.s[3]                       \n"
                    "fmla v27.4s, v11.4s, v3.s[0]                       \n"
                    "fmla v29.4s, v12.4s, v3.s[1]                       \n"

                    "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8]        \n"

                    "fmla v26.4s, v15.4s, v30.s[0]                      \n"
                    "fmla v27.4s, v15.4s, v3.s[2]                       \n"
                    "fmla v29.4s, v16.4s, v4.s[1]                       \n"
                    "fmla v27.4s, v17.4s, v4.s[2]                       \n"

                    "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16]       \n"
                    "prfm pldl1keep, [%[din_ptr1]]                      \n"

                    "fmla v29.4s, v18.4s, v3.s[3]                       \n"
                    "fmla v27.4s, v19.4s, v4.s[0]                       \n"

                    "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8]        \n"

                    "fmla v29.4s, v20.4s, v4.s[3]                       \n"

                    "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8]        \n"

                    "fmla v27.4s, v23.4s, v5.s[0]                       \n"
                    "fmla v27.4s, v21.4s, v5.s[2]                       \n"

                    "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16]       \n"

                    "fmla v29.4s, v24.4s, v5.s[1]                       \n"

                    "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8]        \n"
                    "prfm pldl1keep, [%[din_ptr2]]                      \n"

                    "fmla v29.4s, v22.4s, v5.s[3]                       \n"

                    "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8]        \n"

                    "fmla v27.4s, v25.4s, v30.s[0]                      \n"

                    "fadd v26.4s, v26.4s, v28.4s                        \n"
                    "fadd v27.4s, v27.4s, v29.4s                        \n"
                    "fmax v26.4s, v26.4s, v31.4s                        \n"  // fused ReLU on out r0
                    "fmax v27.4s, v27.4s, v31.4s                        \n"  // fused ReLU on out r1

                    "prfm pldl1keep, [%[din_ptr3]]                      \n"
                    "st1 {v26.4s}, [%[dout_ptr0]], #16                  \n"
                    "st1 {v27.4s}, [%[dout_ptr1]], #16                  \n"

                    "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16]       \n"
                    "subs %w[mid_cnt], %w[mid_cnt], #1                  \n"
                    "bne 1b                                             \n"

                    // right block: zero out-of-range lanes with the bif mask,
                    // compute the last 4 columns into out_buf0/out_buf1
                    "2:                                                 \n"
                    "ld2 {v26.4s, v27.4s}, [%[mask]], %[s_8]            \n"
                    "ld2 {v28.4s, v29.4s}, [%[mask]], %[s_8]            \n"
                    "bif v8.16b, v31.16b, v26.16b                       \n"
                    "bif v9.16b, v31.16b, v27.16b                       \n"
                    "bif v6.16b, v31.16b, v28.16b                       \n"
                    "bif v7.16b, v31.16b, v29.16b                       \n"

                    "bif v13.16b, v31.16b, v26.16b                      \n"
                    "bif v14.16b, v31.16b, v27.16b                      \n"
                    "bif v11.16b, v31.16b, v28.16b                      \n"
                    "bif v12.16b, v31.16b, v29.16b                      \n"

                    "bif v18.16b, v31.16b, v26.16b                      \n"
                    "bif v19.16b, v31.16b, v27.16b                      \n"
                    "bif v16.16b, v31.16b, v28.16b                      \n"
                    "bif v17.16b, v31.16b, v29.16b                      \n"

                    "bif v23.16b, v31.16b, v26.16b                      \n"
                    "bif v24.16b, v31.16b, v27.16b                      \n"
                    "bif v21.16b, v31.16b, v28.16b                      \n"
                    "bif v22.16b, v31.16b, v29.16b                      \n"

                    "ld2 {v28.4s, v29.4s}, [%[mask]]                    \n"
                    "ld1 {v26.4s}, [%[vbias]]                           \n"
                    "mov v29.16b, v31.16b                               \n"

                    "bif v10.16b, v31.16b, v28.16b                      \n"
                    "bif v15.16b, v31.16b, v28.16b                      \n"

                    "mov v27.16b, v26.16b                               \n"

                    "bif v20.16b, v31.16b, v28.16b                      \n"
                    "bif v25.16b, v31.16b, v28.16b                      \n"
                    "mov v28.16b, v31.16b                               \n"

                    "fmla v26.4s, v8.4s, v0.s[0]                        \n"
                    "fmla v28.4s, v9.4s, v0.s[1]                        \n"
                    "fmla v26.4s, v6.4s, v0.s[2]                        \n"
                    "fmla v28.4s, v7.4s, v0.s[3]                        \n"

                    "fmla v26.4s, v10.4s, v1.s[0]                       \n"
                    "fmla v28.4s, v13.4s, v1.s[1]                       \n"
                    "fmla v26.4s, v14.4s, v1.s[2]                       \n"
                    "fmla v28.4s, v11.4s, v1.s[3]                       \n"

                    // rewind and reload the masks into v6-v11 for rows 4-6
                    "sub %[mask], %[mask], #16                          \n"
                    "ld2 {v6.4s, v7.4s}, [%[mask]], %[s_8]              \n"
                    "ld2 {v8.4s, v9.4s}, [%[mask]], %[s_8]              \n"
                    "ld2 {v10.4s, v11.4s}, [%[mask]]                    \n"

                    "fmla v26.4s, v12.4s, v2.s[0]                       \n"
                    "fmla v28.4s, v15.4s, v2.s[1]                       \n"

                    "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8]        \n"

                    "fmla v26.4s, v16.4s, v3.s[0]                       \n"
                    "fmla v28.4s, v17.4s, v3.s[1]                       \n"

                    "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8]        \n"

                    "fmla v27.4s, v16.4s, v0.s[2]                       \n"
                    "fmla v29.4s, v17.4s, v0.s[3]                       \n"

                    "ld2 {v15.4s, v16.4s}, [%[din_ptr4]]                \n"

                    "fmla v26.4s, v18.4s, v2.s[2]                       \n"
                    "fmla v28.4s, v19.4s, v2.s[3]                       \n"
                    "fmla v27.4s, v18.4s, v0.s[0]                       \n"
                    "fmla v29.4s, v19.4s, v0.s[1]                       \n"

                    "bif  v13.16b, v31.16b, v6.16b                      \n"
                    "bif  v14.16b, v31.16b, v7.16b                      \n"
                    "bif  v11.16b, v31.16b, v8.16b                      \n"
                    "bif  v12.16b, v31.16b, v9.16b                      \n"
                    "bif  v15.16b, v31.16b, v10.16b                     \n"

                    "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8]        \n"

                    "fmla v26.4s, v20.4s, v3.s[2]                       \n"
                    "fmla v27.4s, v20.4s, v1.s[0]                       \n"

                    "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8]        \n"

                    "fmla v29.4s, v21.4s, v1.s[3]                       \n"
                    "fmla v28.4s, v21.4s, v4.s[1]                       \n"

                    "ld2 {v20.4s, v21.4s}, [%[din_ptr5]]                \n"

                    "fmla v28.4s, v23.4s, v3.s[3]                       \n"
                    "fmla v29.4s, v23.4s, v1.s[1]                       \n"
                    "fmla v27.4s, v24.4s, v1.s[2]                       \n"
                    "fmla v26.4s, v24.4s, v4.s[0]                       \n"

                    "bif  v18.16b, v31.16b, v6.16b                      \n"
                    "bif  v19.16b, v31.16b, v7.16b                      \n"
                    "bif  v16.16b, v31.16b, v8.16b                      \n"
                    "bif  v17.16b, v31.16b, v9.16b                      \n"
                    "bif  v20.16b, v31.16b, v10.16b                     \n"

                    "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8]        \n"

                    "fmla v27.4s, v22.4s, v2.s[0]                       \n"
                    "fmla v26.4s, v22.4s, v4.s[2]                       \n"

                    "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8]        \n"

                    "fmla v28.4s, v25.4s, v4.s[3]                       \n"
                    "fmla v29.4s, v25.4s, v2.s[1]                       \n"
                    "fadd v28.4s, v28.4s, v26.4s                        \n"

                    "ld2 {v25.4s, v26.4s}, [%[din_ptr6]]                \n"
                    "mov v26.16b, v31.16b                               \n"

                    "bif  v23.16b, v31.16b, v6.16b                      \n"
                    "bif  v24.16b, v31.16b, v7.16b                      \n"
                    "bif  v21.16b, v31.16b, v8.16b                      \n"
                    "bif  v22.16b, v31.16b, v9.16b                      \n"
                    "bif  v25.16b, v31.16b, v10.16b                     \n"

                    "fmla v26.4s, v13.4s, v5.s[0]                       \n"
                    "fmla v28.4s, v14.4s, v5.s[1]                       \n"
                    "fmla v26.4s, v11.4s, v5.s[2]                       \n"
                    "fmla v28.4s, v12.4s, v5.s[3]                       \n"
                    "fmla v26.4s, v15.4s, v30.s[0]                      \n"

                    "fmla v27.4s, v13.4s, v2.s[2]                       \n"
                    "fmla v29.4s, v14.4s, v2.s[3]                       \n"
                    "fmla v27.4s, v11.4s, v3.s[0]                       \n"
                    "fmla v29.4s, v12.4s, v3.s[1]                       \n"

                    "fadd v26.4s, v26.4s, v28.4s                        \n"
                    "fmla v27.4s, v15.4s, v3.s[2]                       \n"
                    "fmla v29.4s, v18.4s, v3.s[3]                       \n"
                    "fmla v27.4s, v19.4s, v4.s[0]                       \n"
                    "fmla v29.4s, v16.4s, v4.s[1]                       \n"

                    "fmax v26.4s, v26.4s, v31.4s                        \n"  // fused ReLU on out r0
                    "fmla v27.4s, v17.4s, v4.s[2]                       \n"
                    "fmla v29.4s, v20.4s, v4.s[3]                       \n"
                    "fmla v27.4s, v23.4s, v5.s[0]                       \n"
                    "fmla v29.4s, v24.4s, v5.s[1]                       \n"

                    "st1 {v26.4s}, [%[out_buf0]]                        \n"
                    "fmla v27.4s, v21.4s, v5.s[2]                       \n"
                    "fmla v29.4s, v22.4s, v5.s[3]                       \n"
                    "fmla v27.4s, v25.4s, v30.s[0]                      \n"
                    "fadd v27.4s, v27.4s, v29.4s                        \n"

                    "fmax v27.4s, v27.4s, v31.4s                        \n"  // fused ReLU on out r1
                    "st1 {v27.4s}, [%[out_buf1]]                        \n"

                : [dout_ptr0] "+r" (dout_ptr0), [dout_ptr1] "+r" (dout_ptr1), [mid_cnt] "+r" (loop),  \
                  [din_ptr0] "+r" (din_ptr0), [din_ptr1] "+r" (din_ptr1), [din_ptr2] "+r" (din_ptr2), [din_ptr3] "+r" (din_ptr3), [din_ptr4] "+r" (din_ptr4), \
                  [din_ptr5] "+r" (din_ptr5), [din_ptr6] "+r" (din_ptr6), [mask] "+r" (mask_ptr), [weights] "+r" (weights_ptr)
                : [vbias] "r" (vbias), [out_buf0] "r" (out_buf0), [out_buf1] "r" (out_buf1), [s_8] "r" (s_8), \
                  [s_16] "r" (s_16)
                : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", \
                  "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
                );

                // Copy only the in-bounds lanes of the masked right block
                // (out-of-bounds lanes were computed but are discarded here).
                int remain_cnt = w_out - (mid_cnt + 1) * 4;
                for (int i = 0; i < remain_cnt; ++i){
                    dout_ptr0[i] = out_buf0[i];
                    dout_ptr1[i] = out_buf1[i];
                }
                // Advance the 7-row input window by 4 rows (= 2 output rows
                // at stride 2) and the output cursors by 2 rows.
                din0 = din4;
                din1 = din5;
                din2 = din6;
                din3 = din6 + w_in;
                din4 = din3 + w_in;
                din5 = din4 + w_in;
                din6 = din5 + w_in;
                dout0 = dout1 + w_out;
                dout1 = dout0 + w_out;
            }
        }
    }
}

//! small depthwise, win < 9;
//! Depthwise 5x5 convolution, stride 2, padding 2, specialized for narrow
//! inputs (w_in < 9). One pass of the aarch64 NEON inline-asm kernel below
//! produces one full output row per iteration of the h loop, accumulated in
//! a 4-float stack buffer and then copied out element-by-element.
//!
//! @param din        input activations, laid out as [num][ch_in][h_in][w_in]
//! @param dout       output activations, laid out as [num][ch_out][h_out][w_out]
//! @param weights    depthwise kernels, 25 contiguous floats per channel
//! @param bias       per-channel bias, read only when flag_bias is true
//! @param flag_relu  not applied here; the fused-ReLU variant is
//!                   conv_depthwise_5x5s2p2_relu_s
//! @param ctx        supplies the scratch workspace used as a zero row
//!
//! NOTE(review): assumes ch_out == ch_in (depthwise) and w_out <= 4, which
//! the 5x5/s2/p2 geometry implies for w_in < 9 — TODO confirm with callers.
void conv_depthwise_5x5s2p2_s(const float* din, float* dout, \
          int num, int ch_out, int h_out, int w_out, \
          int ch_in, int h_in, int w_in, \
          const float* weights, const float* bias, \
          bool flag_bias, bool flag_relu, Context<ARM>* ctx){

    CHECK_LT(w_in, 9) << "only support win < 9";
    // NOTE(review): w_out_round is computed but never used in this function.
    int w_out_round = (w_out + 3) / 4 * 4;
    // Lane mask of 12 words: all-ones for valid input columns, zero for the
    // trailing (12 - w_in - 2) columns past the padded right edge. The asm
    // loads it with overlapping ld2 windows and uses it in BIF instructions
    // to zero out-of-range input lanes.
    int mask_cnt = 12 - w_in - 2;
    int mask[12] ;
    memset(mask, 0xff, 12 * sizeof(int));
    for (int i = 0; i < mask_cnt; ++i){
        mask[11 - i] = 0;
    }
    // A zeroed row in the context workspace stands in for padded rows above
    // the top and below the bottom of the image.
    float* zero_ptr = static_cast<float*>(ctx->get_work_space());
    memset(zero_ptr, 0, w_in * sizeof(float));
    int in_spatial_size = w_in * h_in;
    int out_spatial_size = w_out * h_out;
    int weights_saptial_size = 25;  // 5x5 kernel footprint (name typo: "saptial")

    for (int n = 0; n < num; ++n){
        const float* din_batch = din + n * in_spatial_size * ch_in;
        float* dout_batch = dout + n * out_spatial_size * ch_out;
#pragma omp parallel for
        for (int c = 0; c < ch_in; ++c){
            const float* din_ch = din_batch + c * in_spatial_size;
            float* dout_ch = dout_batch + c * out_spatial_size;
            // Top padding (pad == 2): the first two rows of the 5-row window
            // read the zero row.
            const float* din0 = zero_ptr;
            const float* din1 = zero_ptr;
            const float* din2 = din_ch;
            const float* din3 = din2 + w_in;
            const float* din4 = din3 + w_in;

            float out_buf0[4];
            // NOTE(review): out_buf1 and dout1 are dead in this function —
            // the asm stores to out_buf0 only and dout1 is never read.
            float out_buf1[4];
            float* dout0 = dout_ch;
            float* dout1 = dout0 + w_out;

            const float* weights_c = weights + c * weights_saptial_size;
            for (int h = 0; h < h_out; h += 1){
                //! (h * 2 - 2) + 4 > h_in - 1
                // Bottom padding: when the 5-row window extends past the last
                // input row, redirect the overhanging rows to the zero row.
                // The cases intentionally fall through (no break) so every
                // later row of the window is zeroed as well.
                if (h * 2 + 3 > h_in){
                    switch (h * 2 + 3 - h_in){
                        case 4:
                            din1 = zero_ptr;
                        case 3:
                            din2 = zero_ptr;
                        case 2:
                            din3 = zero_ptr;
                        case 1:
                            din4 = zero_ptr;
                        default:
                            break;
                    }
                }

                const float* din_ptr0 = din0;
                const float* din_ptr1 = din1;
                const float* din_ptr2 = din2;
                const float* din_ptr3 = din3;
                const float* din_ptr4 = din4;

                const float* weights_ptr = weights_c;
                float* dout_ptr0 = dout0;

                // Broadcast the per-channel bias into a 4-lane init vector.
                float bias_c = 0.f;
                if (flag_bias){
                    bias_c = bias[c];
                }
                float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
                int* mask_ptr = mask;
                const int s_8 = 8;  // post-increment (bytes) for the overlapping mask loads
                //! in r0/r4, r1, r2, r3: x 0 2 4 -- v8   v13  v18  v23  v28
                //! in r0/r4, r1, r2, r3: x 1 3 5 -- v9   v14  v19  v24  v29
                //! in r0/r4, r1, r2, r3: 0 2 4 6 -- v6   v11  v16  v21  v26
                //! in r0/r4, r1, r2, r3: 1 3 5 7 -- v7   v12  v17  v22  v27
                //! in r0/r4, r1, r2, r3: 2 4 6 8 -- v10  v15  v20  v25  v30
                //! out r0 -- v4
                asm volatile(
                    "movi   v31.4s, #0x0\n"
                    "prfm pldl1keep, [%[din_ptr0]]  \n"
                    "prfm pldl1keep, [%[din_ptr1]]  \n"
                    "prfm pldl1keep, [%[din_ptr2]]  \n"
                    "prfm pldl1keep, [%[din_ptr3]]  \n"
                    "prfm pldl1keep, [%[din_ptr4]]  \n"
                    "prfm pldl1keep, [%[weights]]   \n"
                    "prfm pldl1keep, [%[mask]]      \n"

                    //! load mask
                    // ld2 de-interleaves even/odd mask words; the pointer
                    // advances only 8 bytes, so consecutive windows overlap.
                    "ld2 {v0.4s, v1.4s}, [%[mask]], %[s_8]  \n"
                    "ld2 {v2.4s, v3.4s}, [%[mask]], %[s_8]  \n"
                    "ld2 {v4.4s, v5.4s}, [%[mask]]  \n"

                    //! load and extract input
                    // ld2 splits each row into even/odd columns (stride-2 taps).
                    "ld2 {v6.4s, v7.4s},   [%[din_ptr0]], #32  \n"
                    "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n"
                    "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n"
                    "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n"
                    "ld2 {v26.4s, v27.4s}, [%[din_ptr4]], #32 \n"

                    // ext with the zero register shifts in a leading zero lane
                    // (left padding of 2 columns)...
                    "ext v8.16b, v31.16b, v6.16b, #12  \n"
                    "ext v9.16b, v31.16b, v7.16b, #12  \n"
                    "ext v13.16b, v31.16b, v11.16b, #12  \n"
                    "ext v14.16b, v31.16b, v12.16b, #12  \n"

                    "ext v18.16b, v31.16b, v16.16b, #12  \n"
                    "ext v19.16b, v31.16b, v17.16b, #12  \n"
                    "ext v23.16b, v31.16b, v21.16b, #12  \n"
                    "ext v24.16b, v31.16b, v22.16b, #12  \n"
                    "ext v28.16b, v31.16b, v26.16b, #12  \n"
                    "ext v29.16b, v31.16b, v27.16b, #12  \n"

                    // ...and these exts shift lanes left by one for the +2
                    // column taps.
                    "ext v10.16b, v6.16b,  v31.16b, #4  \n"
                    "ext v15.16b, v11.16b, v31.16b, #4  \n"
                    "ext v20.16b, v16.16b, v31.16b, #4  \n"
                    "ext v25.16b, v21.16b, v31.16b, #4  \n"
                    "ext v30.16b, v26.16b, v31.16b, #4  \n"

                    // bif zeroes lanes whose mask bit is clear (right padding).
                    "bif v8.16b, v31.16b, v0.16b  \n"
                    "bif v9.16b, v31.16b, v1.16b  \n"
                    "bif v6.16b, v31.16b, v2.16b  \n"
                    "bif v7.16b, v31.16b, v3.16b  \n"

                    "bif v13.16b, v31.16b, v0.16b  \n"
                    "bif v14.16b, v31.16b, v1.16b  \n"
                    "bif v11.16b, v31.16b, v2.16b  \n"
                    "bif v12.16b, v31.16b, v3.16b  \n"

                    "bif v18.16b, v31.16b, v0.16b  \n"
                    "bif v19.16b, v31.16b, v1.16b  \n"
                    "bif v16.16b, v31.16b, v2.16b  \n"
                    "bif v17.16b, v31.16b, v3.16b  \n"

                    // Patch the last lane of each "+2" vector with the 9th
                    // input column (din_ptr already advanced by 32 bytes).
                    "ld1 {v10.s}[3], [%[din_ptr0]]  \n"
                    "ld1 {v15.s}[3], [%[din_ptr1]]  \n"
                    "ld1 {v20.s}[3], [%[din_ptr2]]  \n"
                    "ld1 {v25.s}[3], [%[din_ptr3]]  \n"
                    "ld1 {v30.s}[3], [%[din_ptr4]]  \n"

                    "bif v23.16b, v31.16b, v0.16b  \n"
                    "bif v24.16b, v31.16b, v1.16b  \n"
                    "bif v21.16b, v31.16b, v2.16b  \n"
                    "bif v22.16b, v31.16b, v3.16b  \n"

                    "bif v28.16b, v31.16b, v0.16b  \n"
                    "bif v29.16b, v31.16b, v1.16b  \n"
                    "bif v26.16b, v31.16b, v2.16b  \n"
                    "bif v27.16b, v31.16b, v3.16b  \n"

                    "bif v10.16b, v31.16b, v4.16b  \n"
                    "bif v15.16b, v31.16b, v4.16b  \n"
                    "bif v20.16b, v31.16b, v4.16b  \n"
                    "bif v25.16b, v31.16b, v4.16b  \n"
                    "bif v30.16b, v31.16b, v4.16b  \n"

                    // v4/v5 are the two accumulators: v4 starts from the bias,
                    // v5 from zero; they are summed at the end.
                    "ld1 {v4.4s}, [%[vbias]]  \n"
                    "mov v5.16b, v31.16b  \n"

                    "ld1 {v0.4s, v1.4s}, [%[weights]], #32  \n"   // load weights 0-7
                    "ld1 {v2.4s, v3.4s}, [%[weights]], #32  \n"   // load weights 8-15

                    //! compute
                    "fmla v4.4s, v8.4s, v0.s[0]  \n"  // out r0: w0
                    "fmla v5.4s, v9.4s, v0.s[1]  \n"  // out r0: w1
                    "fmla v4.4s, v6.4s, v0.s[2]  \n"  // out r0: w2
                    "fmla v5.4s, v7.4s, v0.s[3]  \n"  // out r0: w3

                    "fmla v4.4s, v10.4s, v1.s[0]  \n"  // out r0: w4
                    "fmla v5.4s, v13.4s, v1.s[1]  \n"  // out r0: w5
                    "fmla v4.4s, v14.4s, v1.s[2]  \n"  // out r0: w6
                    "fmla v5.4s, v11.4s, v1.s[3]  \n"  // out r0: w7

                    "ld1 {v6.4s, v7.4s}, [%[weights]], #32  \n"    // load weights 16-23
                    "ld1 {v8.s}[0], [%[weights]]  \n"              // load weights 24

                    "fmla v4.4s, v12.4s, v2.s[0]  \n"  // out r0: w8
                    "fmla v5.4s, v15.4s, v2.s[1]  \n"  // out r0: w9
                    "fmla v4.4s, v18.4s, v2.s[2]  \n"  // out r0: w10
                    "fmla v5.4s, v19.4s, v2.s[3]  \n"  // out r0: w11

                    "fmla v4.4s, v16.4s, v3.s[0]  \n"  // out r0: w12
                    "fmla v5.4s, v17.4s, v3.s[1]  \n"  // out r0: w13
                    "fmla v4.4s, v20.4s, v3.s[2]  \n"  // out r0: w14
                    "fmla v5.4s, v23.4s, v3.s[3]  \n"  // out r0: w15

                    "fmla v4.4s, v24.4s, v6.s[0]  \n"  // out r0: w16
                    "fmla v5.4s, v21.4s, v6.s[1]  \n"  // out r0: w17
                    "fmla v4.4s, v22.4s, v6.s[2]  \n"  // out r0: w18
                    "fmla v5.4s, v25.4s, v6.s[3]  \n"  // out r0: w19

                    "fmla v4.4s, v28.4s, v7.s[0]  \n"  // out r0: w20
                    "fmla v5.4s, v29.4s, v7.s[1]  \n"  // out r0: w21
                    "fmla v4.4s, v26.4s, v7.s[2]  \n"  // out r0: w22
                    "fmla v5.4s, v27.4s, v7.s[3]  \n"  // out r0: w23
                    "fmla v4.4s, v30.4s, v8.s[0] \n"   // out r0: w24

                    "fadd v4.4s, v4.4s, v5.4s  \n"    // add out to v4
                    "st1 {v4.4s}, [%[out_buf0]]  \n"

                // mask and weights are "+r" because the asm post-increments
                // them; din_ptr* likewise advance during the loads.
                // NOTE(review): dout_ptr0 is listed as an operand but never
                // referenced inside the asm template.
                : [dout_ptr0] "+r" (dout_ptr0), [din_ptr0] "+r" (din_ptr0), [din_ptr1] "+r" (din_ptr1), [din_ptr2] "+r" (din_ptr2), \
                  [din_ptr3] "+r" (din_ptr3), [din_ptr4] "+r" (din_ptr4), [mask] "+r" (mask_ptr), [weights] "+r" (weights_ptr)
                : [vbias] "r" (vbias), [out_buf0] "r" (out_buf0), [out_buf1] "r" (out_buf1), [s_8] "r" (s_8)
                : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", \
                  "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
                );
                // Copy only the valid w_out lanes from the 4-wide buffer.
                for (int i = 0; i < w_out; ++i){
                    dout_ptr0[i] = out_buf0[i];
                }
                // Slide the 5-row window down by the stride (2 rows).
                din0 = din2;
                din1 = din3;
                din2 = din4;
                din3 = din2 + w_in;
                din4 = din3 + w_in;
                dout0 += w_out;
            }
        }
    }
}

//! small depthwise, win < 9;
//! Depthwise 5x5 convolution with fused ReLU, stride 2, padding 2,
//! specialized for narrow inputs (w_in < 9). Identical to
//! conv_depthwise_5x5s2p2_s except for the trailing fmax-with-zero that
//! clamps the output (ReLU) before the store.
//!
//! @param din        input activations, laid out as [num][ch_in][h_in][w_in]
//! @param dout       output activations, laid out as [num][ch_out][h_out][w_out]
//! @param weights    depthwise kernels, 25 contiguous floats per channel
//! @param bias       per-channel bias, read only when flag_bias is true
//! @param ctx        supplies the scratch workspace used as a zero row
//!
//! NOTE(review): assumes ch_out == ch_in (depthwise) and w_out <= 4, which
//! the 5x5/s2/p2 geometry implies for w_in < 9 — TODO confirm with callers.
void conv_depthwise_5x5s2p2_relu_s(const float* din, float* dout, \
  int num, int ch_out, int h_out, int w_out, \
  int ch_in, int h_in, int w_in, \
  const float* weights, const float* bias, \
  bool flag_bias, bool flag_relu, Context<ARM>* ctx){

    CHECK_LT(w_in, 9) << "only support win < 9";
    // NOTE(review): w_out_round is computed but never used in this function.
    int w_out_round = (w_out + 3) / 4 * 4;
    // Lane mask of 12 words: all-ones for valid input columns, zero for the
    // trailing (12 - w_in - 2) columns past the padded right edge; consumed
    // by the BIF instructions in the asm to zero out-of-range lanes.
    int mask_cnt = 12 - w_in - 2;
    int mask[12] ;
    memset(mask, 0xff, 12 * sizeof(int));
    for (int i = 0; i < mask_cnt; ++i){
        mask[11 - i] = 0;
    }
    // A zeroed row in the context workspace stands in for padded rows above
    // the top and below the bottom of the image.
    float* zero_ptr = static_cast<float*>(ctx->get_work_space());
    memset(zero_ptr, 0, w_in * sizeof(float));
    int in_spatial_size = w_in * h_in;
    int out_spatial_size = w_out * h_out;
    int weights_saptial_size = 25;  // 5x5 kernel footprint (name typo: "saptial")

    for (int n = 0; n < num; ++n){
        const float* din_batch = din + n * in_spatial_size * ch_in;
        float* dout_batch = dout + n * out_spatial_size * ch_out;
#pragma omp parallel for
        for (int c = 0; c < ch_in; ++c){
            const float* din_ch = din_batch + c * in_spatial_size;
            float* dout_ch = dout_batch + c * out_spatial_size;
            // Top padding (pad == 2): the first two rows of the 5-row window
            // read the zero row.
            const float* din0 = zero_ptr;
            const float* din1 = zero_ptr;
            const float* din2 = din_ch;
            const float* din3 = din2 + w_in;
            const float* din4 = din3 + w_in;

            float out_buf0[4];
            // NOTE(review): out_buf1 and dout1 are dead in this function —
            // the asm stores to out_buf0 only and dout1 is never read.
            float out_buf1[4];
            float* dout0 = dout_ch;
            float* dout1 = dout0 + w_out;

            const float* weights_c = weights + c * weights_saptial_size;
            for (int h = 0; h < h_out; h += 1){
                //! (h * 2 - 2) + 4 > h_in - 1
                // Bottom padding: when the 5-row window extends past the last
                // input row, redirect the overhanging rows to the zero row.
                // The cases intentionally fall through (no break) so every
                // later row of the window is zeroed as well.
                if (h * 2 + 3 > h_in){
                    switch (h * 2 + 3 - h_in){
                        case 4:
                            din1 = zero_ptr;
                        case 3:
                            din2 = zero_ptr;
                        case 2:
                            din3 = zero_ptr;
                        case 1:
                            din4 = zero_ptr;
                        default:
                            break;
                    }
                }
                const float* din_ptr0 = din0;
                const float* din_ptr1 = din1;
                const float* din_ptr2 = din2;
                const float* din_ptr3 = din3;
                const float* din_ptr4 = din4;

                const float* weights_ptr = weights_c;
                float* dout_ptr0 = dout0;

                // Broadcast the per-channel bias into a 4-lane init vector.
                float bias_c = 0.f;
                if (flag_bias){
                    bias_c = bias[c];
                }
                float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
                int* mask_ptr = mask;
                const int s_8 = 8;  // post-increment (bytes) for the overlapping mask loads
                //! in r0/r4, r1, r2, r3: x 0 2 4 -- v8   v13  v18  v23  v28
                //! in r0/r4, r1, r2, r3: x 1 3 5 -- v9   v14  v19  v24  v29
                //! in r0/r4, r1, r2, r3: 0 2 4 6 -- v6   v11  v16  v21  v26
                //! in r0/r4, r1, r2, r3: 1 3 5 7 -- v7   v12  v17  v22  v27
                //! in r0/r4, r1, r2, r3: 2 4 6 8 -- v10  v15  v20  v25  v30
                //! out r0 -- v4
                asm volatile(
                    "movi   v31.4s, #0x0\n"
                    "prfm pldl1keep, [%[din_ptr0]]  \n"
                    "prfm pldl1keep, [%[din_ptr1]]  \n"
                    "prfm pldl1keep, [%[din_ptr2]]  \n"
                    "prfm pldl1keep, [%[din_ptr3]]  \n"
                    "prfm pldl1keep, [%[din_ptr4]]  \n"
                    "prfm pldl1keep, [%[weights]]   \n"
                    "prfm pldl1keep, [%[mask]]      \n"

                    //! load mask
                    // ld2 de-interleaves even/odd mask words; the pointer
                    // advances only 8 bytes, so consecutive windows overlap.
                    "ld2 {v0.4s, v1.4s}, [%[mask]], %[s_8]  \n"
                    "ld2 {v2.4s, v3.4s}, [%[mask]], %[s_8]  \n"
                    "ld2 {v4.4s, v5.4s}, [%[mask]]  \n"

                    //! load and extract input
                    // ld2 splits each row into even/odd columns (stride-2 taps).
                    "ld2 {v6.4s, v7.4s},   [%[din_ptr0]], #32  \n"
                    "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n"
                    "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n"
                    "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n"
                    "ld2 {v26.4s, v27.4s}, [%[din_ptr4]], #32 \n"

                    // ext with the zero register shifts in a leading zero lane
                    // (left padding of 2 columns)...
                    "ext v8.16b, v31.16b, v6.16b, #12  \n"
                    "ext v9.16b, v31.16b, v7.16b, #12  \n"
                    "ext v13.16b, v31.16b, v11.16b, #12  \n"
                    "ext v14.16b, v31.16b, v12.16b, #12  \n"

                    "ext v18.16b, v31.16b, v16.16b, #12  \n"
                    "ext v19.16b, v31.16b, v17.16b, #12  \n"
                    "ext v23.16b, v31.16b, v21.16b, #12  \n"
                    "ext v24.16b, v31.16b, v22.16b, #12  \n"
                    "ext v28.16b, v31.16b, v26.16b, #12  \n"
                    "ext v29.16b, v31.16b, v27.16b, #12  \n"

                    // ...and these exts shift lanes left by one for the +2
                    // column taps.
                    "ext v10.16b, v6.16b,  v31.16b, #4  \n"
                    "ext v15.16b, v11.16b, v31.16b, #4  \n"
                    "ext v20.16b, v16.16b, v31.16b, #4  \n"
                    "ext v25.16b, v21.16b, v31.16b, #4  \n"
                    "ext v30.16b, v26.16b, v31.16b, #4  \n"

                    // bif zeroes lanes whose mask bit is clear (right padding).
                    "bif v8.16b, v31.16b, v0.16b  \n"
                    "bif v9.16b, v31.16b, v1.16b  \n"
                    "bif v6.16b, v31.16b, v2.16b  \n"
                    "bif v7.16b, v31.16b, v3.16b  \n"

                    "bif v13.16b, v31.16b, v0.16b  \n"
                    "bif v14.16b, v31.16b, v1.16b  \n"
                    "bif v11.16b, v31.16b, v2.16b  \n"
                    "bif v12.16b, v31.16b, v3.16b  \n"

                    "bif v18.16b, v31.16b, v0.16b  \n"
                    "bif v19.16b, v31.16b, v1.16b  \n"
                    "bif v16.16b, v31.16b, v2.16b  \n"
                    "bif v17.16b, v31.16b, v3.16b  \n"

                    // Patch the last lane of each "+2" vector with the 9th
                    // input column (din_ptr already advanced by 32 bytes).
                    "ld1 {v10.s}[3], [%[din_ptr0]]  \n"
                    "ld1 {v15.s}[3], [%[din_ptr1]]  \n"
                    "ld1 {v20.s}[3], [%[din_ptr2]]  \n"
                    "ld1 {v25.s}[3], [%[din_ptr3]]  \n"
                    "ld1 {v30.s}[3], [%[din_ptr4]]  \n"

                    "bif v23.16b, v31.16b, v0.16b  \n"
                    "bif v24.16b, v31.16b, v1.16b  \n"
                    "bif v21.16b, v31.16b, v2.16b  \n"
                    "bif v22.16b, v31.16b, v3.16b  \n"

                    "bif v28.16b, v31.16b, v0.16b  \n"
                    "bif v29.16b, v31.16b, v1.16b  \n"
                    "bif v26.16b, v31.16b, v2.16b  \n"
                    "bif v27.16b, v31.16b, v3.16b  \n"

                    "bif v10.16b, v31.16b, v4.16b  \n"
                    "bif v15.16b, v31.16b, v4.16b  \n"
                    "bif v20.16b, v31.16b, v4.16b  \n"
                    "bif v25.16b, v31.16b, v4.16b  \n"
                    "bif v30.16b, v31.16b, v4.16b  \n"

                    // v4/v5 are the two accumulators: v4 starts from the bias,
                    // v5 from zero; they are summed at the end.
                    "ld1 {v4.4s}, [%[vbias]]  \n"
                    "mov v5.16b, v31.16b  \n"

                    "ld1 {v0.4s, v1.4s}, [%[weights]], #32  \n"   // load weights 0-7
                    "ld1 {v2.4s, v3.4s}, [%[weights]], #32  \n"   // load weights 8-15

                    //! compute
                    "fmla v4.4s, v8.4s, v0.s[0]  \n"  // out r0: w0
                    "fmla v5.4s, v9.4s, v0.s[1]  \n"  // out r0: w1
                    "fmla v4.4s, v6.4s, v0.s[2]  \n"  // out r0: w2
                    "fmla v5.4s, v7.4s, v0.s[3]  \n"  // out r0: w3

                    "fmla v4.4s, v10.4s, v1.s[0]  \n"  // out r0: w4
                    "fmla v5.4s, v13.4s, v1.s[1]  \n"  // out r0: w5
                    "fmla v4.4s, v14.4s, v1.s[2]  \n"  // out r0: w6
                    "fmla v5.4s, v11.4s, v1.s[3]  \n"  // out r0: w7

                    "ld1 {v6.4s, v7.4s}, [%[weights]], #32  \n"    // load weights 16-23
                    "ld1 {v8.s}[0], [%[weights]]  \n"              // load weights 24

                    "fmla v4.4s, v12.4s, v2.s[0]  \n"  // out r0: w8
                    "fmla v5.4s, v15.4s, v2.s[1]  \n"  // out r0: w9
                    "fmla v4.4s, v18.4s, v2.s[2]  \n"  // out r0: w10
                    "fmla v5.4s, v19.4s, v2.s[3]  \n"  // out r0: w11

                    "fmla v4.4s, v16.4s, v3.s[0]  \n"  // out r0: w12
                    "fmla v5.4s, v17.4s, v3.s[1]  \n"  // out r0: w13
                    "fmla v4.4s, v20.4s, v3.s[2]  \n"  // out r0: w14
                    "fmla v5.4s, v23.4s, v3.s[3]  \n"  // out r0: w15

                    "fmla v4.4s, v24.4s, v6.s[0]  \n"  // out r0: w16
                    "fmla v5.4s, v21.4s, v6.s[1]  \n"  // out r0: w17
                    "fmla v4.4s, v22.4s, v6.s[2]  \n"  // out r0: w18
                    "fmla v5.4s, v25.4s, v6.s[3]  \n"  // out r0: w19

                    "fmla v4.4s, v28.4s, v7.s[0]  \n"  // out r0: w20
                    "fmla v5.4s, v29.4s, v7.s[1]  \n"  // out r0: w21
                    "fmla v4.4s, v26.4s, v7.s[2]  \n"  // out r0: w22
                    "fmla v5.4s, v27.4s, v7.s[3]  \n"  // out r0: w23
                    "fmla v4.4s, v30.4s, v8.s[0]  \n"  // out r0: w24

                    "fadd v4.4s, v4.4s, v5.4s     \n"  // add out to v4
                    // Fused ReLU: clamp to zero before the store.
                    "fmax v4.4s, v4.4s, v31.4s    \n"
                    "st1 {v4.4s}, [%[out_buf0]]   \n"

                // mask and weights are "+r" because the asm post-increments
                // them; din_ptr* likewise advance during the loads.
                // NOTE(review): dout_ptr0 is listed as an operand but never
                // referenced inside the asm template.
                : [dout_ptr0] "+r" (dout_ptr0), [din_ptr0] "+r" (din_ptr0), [din_ptr1] "+r" (din_ptr1), [din_ptr2] "+r" (din_ptr2), \
                  [din_ptr3] "+r" (din_ptr3), [din_ptr4] "+r" (din_ptr4), [mask] "+r" (mask_ptr), [weights] "+r" (weights_ptr)
                : [vbias] "r" (vbias), [out_buf0] "r" (out_buf0), [out_buf1] "r" (out_buf1), [s_8] "r" (s_8)
                : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", \
                  "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
                );
                // Copy only the valid w_out lanes from the 4-wide buffer.
                for (int i = 0; i < w_out; ++i){
                    dout_ptr0[i] = out_buf0[i];
                }
                // Slide the 5-row window down by the stride (2 rows).
                din0 = din2;
                din1 = din3;
                din2 = din4;
                din3 = din2 + w_in;
                din4 = din3 + w_in;
                dout0 += w_out;
            }
        }
    }
}

#else

//! larger depthwise, win >= 9;
void conv_depthwise_5x5s2p2(const float* din, float* dout, \
          int num, int ch_out, int h_out, int w_out, \
          int ch_in, int h_in, int w_in, \
          const float* weights, const float* bias, \
          bool flag_bias, bool flag_relu, Context<ARM>* ctx){

    //printf("invoke 5x5s2p2 armv7\n");
    CHECK_GE(w_in, 9) << "only support win >= 9";
    int w_out_round = (w_out + 3) / 4 * 4;
    int cnt = (w_out_round - 4) / 4;
    int mid_cnt = cnt - 1;
    int right_start = cnt * 2 * 4 - 2;
    int mask_cnt = 12 - (w_in - right_start);
    int mask[12] ;
    memset(mask, 0xff, 12 * sizeof(int));
    for (int i = 0; i < mask_cnt; ++i){
        mask[11 - i] = 0;
    }
    float* zero_ptr = static_cast<float*>(ctx->get_work_space());
    memset(zero_ptr, 0, w_in * sizeof(float));
    int in_spatial_size = w_in * h_in;
    int out_spatial_size = w_out * h_out;
    int weights_saptial_size = 25;

    for (int n = 0; n < num; ++n){
        const float* din_batch = din + n * in_spatial_size * ch_in;
        float* dout_batch = dout + n * out_spatial_size * ch_out;
#pragma omp parallel for
        for (int c = 0; c < ch_in; ++c){
            const float* din_ch = din_batch + c * in_spatial_size;
            float* dout_ch = dout_batch + c * out_spatial_size;
            const float* din0 = zero_ptr;
            const float* din1 = zero_ptr;
            const float* din2 = din_ch;
            const float* din3 = din2 + w_in;
            const float* din4 = din3 + w_in;

            float out_buf0[4];
            float* dout0 = dout_ch;

            const float* weights_c = weights + c * weights_saptial_size;
            float32x4_t w0 = vld1q_f32(weights_c);
            float32x4_t w1 = vld1q_f32(weights_c + 4);
            float32x4_t w2 = vld1q_f32(weights_c + 8);
            float32x4_t w3 = vld1q_f32(weights_c + 12);
            float32x4_t w4 = vld1q_f32(weights_c + 16);
            float32x4_t w5 = vld1q_f32(weights_c + 20);
            for (int h = 0; h < h_out; h += 1){
                //! (h * 2 - 2) + 4 > h_in - 1
                if (h * 2 + 3 > h_in){
                    switch (h * 2 + 3 - h_in){
                        case 4:
                            din1 = zero_ptr;
                        case 3:
                            din2 = zero_ptr;
                        case 2:
                            din3 = zero_ptr;
                        case 1:
                            din4 = zero_ptr;
                        default:
                            break;
                    }
                }
                const float* din_ptr0 = din0;
                const float* din_ptr1 = din1;
                const float* din_ptr2 = din2;
                const float* din_ptr3 = din3;
                const float* din_ptr4 = din4;

                const float* weights_ptr = weights_c + 24;
                float* dout_ptr0 = dout0;

                float bias_c = 0.f;
                if (flag_bias){
                    bias_c = bias[c];
                }
                float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
                int* mask_ptr = mask;
                int loop = mid_cnt;
                const int s_8 = 8;
                const int s_16 = 16;

                asm volatile(
                    "vmov.i32   q15, #0x0           \n"
                    "pld [%[din_ptr0]]              \n"
                    "pld [%[din_ptr1]]              \n"
                    "pld [%[din_ptr2]]              \n"
                    "pld [%[din_ptr3]]              \n"
                    "pld [%[din_ptr4]]              \n"
                    "pld [%[mask]]                  \n"

                    // left
                    "vld2.32 {d16-d19}, [%[din_ptr0]]!          \n"
                    "vld1.32 {d26-d29}, [%[vbias]]              \n"
                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"
                    "vmov.32 q14, q15                           \n"

                    //r0
                    "vmla.f32 q13, q8, %f[w0][0]                \n"
                    "vmla.f32 q14, q9, %f[w0][1]                \n"

                    "vld1.32 {d21[1]}, [%[din_ptr0]]            \n"
                    "vld2.32 {d16-d19}, [%[din_ptr1]]!          \n"
                    "sub %[din_ptr0], #8  \n"

                    "vmla.f32 q13, q6, %e[w0][0]                \n"
                    "vmla.f32 q14, q7, %e[w0][1]                \n"
                    "vmla.f32 q13, q10, %e[w1][0]               \n"

                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"

                    //r1
                    "vmla.f32 q13, q8, %f[w1][1]                \n"
                    "vmla.f32 q14, q9, %e[w2][0]                \n"

                    "vld1.32 {d21[1]}, [%[din_ptr1]]            \n"
                    "vld2.32 {d16-d19}, [%[din_ptr2]]!          \n"
                    "sub %[din_ptr1], #8                        \n"

                    "vmla.f32 q13, q6, %e[w1][1]                \n"
                    "vmla.f32 q14, q7, %f[w1][0]                \n"
                    "vmla.f32 q13, q10, %e[w2][1]               \n"

                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"

                    //r2
                    "vmla.f32 q13, q8, %e[w3][0]                \n"
                    "vmla.f32 q14, q9, %e[w3][1]                \n"

                    "vld1.32 {d21[1]}, [%[din_ptr2]]            \n"
                    "vld2.32 {d16-d19}, [%[din_ptr3]]!          \n"
                    "sub %[din_ptr2], #8                        \n"

                    "vmla.f32 q13, q6, %f[w2][0]                \n"
                    "vmla.f32 q14, q7, %f[w2][1]                \n"
                    "vmla.f32 q13, q10, %f[w3][0]               \n"

                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"

                    //r3
                    "vmla.f32 q13, q8, %e[w4][1]                \n"
                    "vmla.f32 q14, q9, %f[w4][0]                \n"

                    "vld1.32 {d21[1]}, [%[din_ptr3]]            \n"
                    "vld2.32 {d16-d19}, [%[din_ptr4]]!          \n"
                    "sub %[din_ptr3], #8                        \n"

                    "vmla.f32 q13, q6, %f[w3][1]                \n"
                    "vmla.f32 q14, q7, %e[w4][0]                \n"
                    "vmla.f32 q13, q10, %f[w4][1]               \n"

                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"

                    //r4
                    "vmla.f32 q13, q6, %e[w5][0]                \n"
                    "vmla.f32 q14, q7, %e[w5][1]                \n"

                    "vld1.32 {d21[1]}, [%[din_ptr4]]            \n"
                    "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8]   \n"
                    "sub %[din_ptr4], #8                        \n"

                    "vmla.f32 q13, q8, %f[w5][0]                \n"
                    "vmla.f32 q14, q9, %f[w5][1]                \n"

                    "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8]   \n"

                    "vmov.32 q12, %q[w0]                        \n"
                    "vld1.32 {%e[w0][0]}, [%[weights]]          \n"
                    "vmla.f32 q13, q10, %e[w0][0]               \n"
                    "vadd.f32 q13, q13, q14                     \n"
                    "vmov.32 %q[w0], q12                        \n"
                    "cmp %[mid_cnt], #1                         \n"
                    "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16]  \n"
                    "vst1.32 {d26-d27}, [%[dout_ptr0]]!         \n"
                    "pld [%[din_ptr0]]                          \n"
                    "blt 2f                                     \n"

                    // mid
                    "1:                                         \n"
                    "vld1.32 {d26-d27}, [%[vbias]]              \n"
                    "vmov.32 q14, q15                           \n"

                    // r0
                    "vmla.f32 q13, q6,  %e[w0][0]               \n"
                    "vmla.f32 q14, q7,  %e[w0][1]               \n"

                    "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8]   \n"

                    "vmla.f32 q13, q8,  %f[w0][0]               \n"
                    "vmla.f32 q14, q9,  %f[w0][1]               \n"

                    "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8]   \n"

                    "vmla.f32 q13, q10, %e[w1][0]               \n"

                    "vld2.32 {d20-d23}, [%[din_ptr1]], %[s_16]  \n"

                    // r1
                    "vmla.f32 q13, q6,  %e[w1][1]               \n"
                    "vmla.f32 q14, q7,  %f[w1][0]               \n"
                    "pld [%[din_ptr1]]                          \n"

                    "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8]   \n"

                    "vmla.f32 q13, q8,  %f[w1][1]               \n"
                    "vmla.f32 q14, q9,  %e[w2][0]               \n"

                    "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8]   \n"

                    "vmla.f32 q13, q10, %e[w2][1]               \n"

                    "vld2.32 {d20-d23}, [%[din_ptr2]], %[s_16]  \n"

                    // r2
                    "vmla.f32 q13, q6,  %f[w2][0]               \n"
                    "vmla.f32 q14, q7,  %f[w2][1]               \n"
                    "pld [%[din_ptr2]]                          \n"

                    "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8]   \n"

                    "vmla.f32 q13, q8,  %e[w3][0]               \n"
                    "vmla.f32 q14, q9,  %e[w3][1]               \n"

                    "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8]   \n"

                    "vmla.f32 q13, q10, %f[w3][0]               \n"

                    "vld2.32 {d20-d23}, [%[din_ptr3]], %[s_16]  \n"

                    //r3
                    "vmla.f32 q13, q6,  %f[w3][1]               \n"
                    "vmla.f32 q14, q7,  %e[w4][0]               \n"
                    "pld [%[din_ptr3]]                          \n"

                    "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8]   \n"

                    "vmla.f32 q13, q8,  %e[w4][1]               \n"
                    "vmla.f32 q14, q9,  %f[w4][0]               \n"

                    "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8]   \n"

                    "vmla.f32 q13, q10, %f[w4][1]               \n"

                    "vld2.32 {d20-d23}, [%[din_ptr4]], %[s_16]  \n"

                    //r4
                    "vmla.f32 q13, q6,  %e[w5][0]               \n"
                    "vmla.f32 q14, q7,  %e[w5][1]               \n"
                    "pld [%[din_ptr4]]                          \n"

                    "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8]   \n"
                    "vld1.32 {%e[w0][0]}, [%[weights]]          \n"

                    "vmla.f32 q13, q8,  %f[w5][0]               \n"
                    "vmla.f32 q14, q9,  %f[w5][1]               \n"

                    "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8]   \n"

                    "vmla.f32 q13, q10, %e[w0][0]               \n"

                    "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16]  \n"

                    "vmov.32 %q[w0], q12                        \n"
                    "vadd.f32 q13, q13, q14                     \n"
                    "subs %[mid_cnt], #1                        \n"
                    "vst1.32 {d26-d27}, [%[dout_ptr0]]!         \n"
                    "bne 1b                                     \n"

                    "2:                                         \n"
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vld1.32 {d26-d27}, [%[vbias]]              \n"
                    "vmov.32 q14, q15                           \n"

                    //r0
                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %e[w0][0]                \n"
                    "vmla.f32 q14, q7, %e[w0][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8]   \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %f[w0][0]                \n"
                    "vmla.f32 q14, q9, %f[w0][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "sub %[mask], #16                           \n"
                    "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8]   \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, %e[w1][0]               \n"

                    // r1
                    "vld2.32 {d20-d23}, [%[din_ptr1]]           \n"
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %e[w1][1]                \n"
                    "vmla.f32 q14, q7, %f[w1][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8]   \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %f[w1][1]                \n"
                    "vmla.f32 q14, q9, %e[w2][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "sub %[mask], #16                           \n"
                    "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8]   \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, %e[w2][1]               \n"

                    //r2
                    "vld2.32 {d20-d23}, [%[din_ptr2]]           \n"
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %f[w2][0]                \n"
                    "vmla.f32 q14, q7, %f[w2][1]                \n"

                    "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8]   \n"
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %e[w3][0]                \n"
                    "vmla.f32 q14, q9, %e[w3][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "sub %[mask], #16                           \n"
                    "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8]   \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, %f[w3][0]               \n"

                    //r3
                    "vld2.32 {d20-d23}, [%[din_ptr3]]           \n"
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %f[w3][1]                \n"
                    "vmla.f32 q14, q7, %e[w4][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8]   \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %e[w4][1]                \n"
                    "vmla.f32 q14, q9, %f[w4][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "sub %[mask], #16                           \n"
                    "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8]   \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, %f[w4][1]               \n"

                    //r4
                    "vld2.32 {d20-d23}, [%[din_ptr4]]           \n"
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %e[w5][0]                \n"
                    "vmla.f32 q14, q7, %e[w5][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vld1.32 {d12[0]}, [%[weights]]             \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %f[w5][0]                \n"
                    "vmla.f32 q14, q9, %f[w5][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, d12[0]                  \n"

                    "vadd.f32 q13, q13, q14                     \n"
                    "vst1.32 {d26-d27}, [%[out_buf0]]           \n"

                : [dout_ptr0] "+r" (dout_ptr0), [mid_cnt] "+r" (loop), [din_ptr0] "+r" (din_ptr0), \
                  [din_ptr1] "+r" (din_ptr1), [din_ptr2] "+r" (din_ptr2), [din_ptr3] "+r" (din_ptr3), \
                  [din_ptr4] "+r" (din_ptr4), [mask] "+r" (mask_ptr), [weights] "+r" (weights_ptr)
                : [w0] "w" (w0), [w1] "w" (w1), [w2] "w" (w2), [w3] "w" (w3), [w4] "w" (w4), [w5] "w" (w5), \
                  [vbias] "r" (vbias), [out_buf0] "r" (out_buf0), [s_8] "r" (s_8), \
                  [s_16] "r" (s_16)
                : "memory", "cc", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
                );

                int remain_cnt = w_out - (mid_cnt + 1) * 4;
                for (int i = 0; i < remain_cnt; ++i){
                    dout_ptr0[i] = out_buf0[i];
                }

                din0 = din2;
                din1 = din3;
                din2 = din4;
                din3 = din2 + w_in;
                din4 = din3 + w_in;
                dout0 += w_out;
            }
        }
    }
}

//! Larger-width depthwise 5x5 conv, stride 2, pad 2, with fused ReLU;
//! only supports w_in >= 9 (narrower inputs take the `_s` small path).
void conv_depthwise_5x5s2p2_relu(const float* din, float* dout, \
          int num, int ch_out, int h_out, int w_out, \
          int ch_in, int h_in, int w_in, \
          const float* weights, const float* bias, \
          bool flag_bias, bool flag_relu, Context<ARM>* ctx){

    //printf("invoke 5x5s2p2 armv7\n");
    CHECK_GE(w_in, 9) << "only support win >= 9";
    // Round the output width up to a multiple of 4 (one NEON q-register).
    int w_out_round = (w_out + 3) / 4 * 4;
    // Number of 4-wide output groups after the first ("left") group.
    int cnt = (w_out_round - 4) / 4;
    // Main-loop trip count; the final group goes through the masked tail.
    int mid_cnt = cnt - 1;
    // First input column consumed by the tail group (stride 2, pad 2).
    int right_start = cnt * 2 * 4 - 2;
    // How many of the tail's 12 input lanes fall past the end of the row.
    int mask_cnt = 12 - (w_in - right_start);
    // Lane mask: all-ones, with the trailing out-of-range lanes zeroed so
    // the tail's vbif instructions substitute 0.f for out-of-bounds loads.
    int mask[12] ;
    memset(mask, 0xff, 12 * sizeof(int));
    for (int i = 0; i < mask_cnt; ++i){
        mask[11 - i] = 0;
    }
    // A zeroed row from the context workspace stands in for the two top
    // padding rows (pad == 2) and for rows past the bottom of the input.
    float* zero_ptr = static_cast<float*>(ctx->get_work_space());
    memset(zero_ptr, 0, w_in * sizeof(float));
    int in_spatial_size = w_in * h_in;
    int out_spatial_size = w_out * h_out;
    int weights_saptial_size = 25;  // 5x5 kernel per channel (name keeps original typo)

    for (int n = 0; n < num; ++n){
        const float* din_batch = din + n * in_spatial_size * ch_in;
        float* dout_batch = dout + n * out_spatial_size * ch_out;
#pragma omp parallel for
        for (int c = 0; c < ch_in; ++c){
            const float* din_ch = din_batch + c * in_spatial_size;
            float* dout_ch = dout_batch + c * out_spatial_size;
            // Five input rows feed one output row; the first two start at
            // the zero row because of the top padding of 2.
            const float* din0 = zero_ptr;
            const float* din1 = zero_ptr;
            const float* din2 = din_ch;
            const float* din3 = din2 + w_in;
            const float* din4 = din3 + w_in;

            float out_buf0[4];
            float* dout0 = dout_ch;

            // weights[0..23] are kept in q-registers w0..w5; the 25th weight
            // is loaded inside the asm from weights_ptr (weights_c + 24).
            const float* weights_c = weights + c * weights_saptial_size;
            float32x4_t w0 = vld1q_f32(weights_c);
            float32x4_t w1 = vld1q_f32(weights_c + 4);
            float32x4_t w2 = vld1q_f32(weights_c + 8);
            float32x4_t w3 = vld1q_f32(weights_c + 12);
            float32x4_t w4 = vld1q_f32(weights_c + 16);
            float32x4_t w5 = vld1q_f32(weights_c + 20);
            for (int h = 0; h < h_out; h += 1){
                //! (h * 2 - 2) + 4 > h_in - 1
                // Redirect rows that fall past the input bottom to the zero
                // row; the cases intentionally fall through so once one row
                // is out of range, every row below it is zeroed as well.
                if (h * 2 + 3 > h_in){
                    switch (h * 2 + 3 - h_in){
                        case 4:
                            din1 = zero_ptr;
                        case 3:
                            din2 = zero_ptr;
                        case 2:
                            din3 = zero_ptr;
                        case 1:
                            din4 = zero_ptr;
                        default:
                            break;
                    }
                }
                const float* din_ptr0 = din0;
                const float* din_ptr1 = din1;
                const float* din_ptr2 = din2;
                const float* din_ptr3 = din3;
                const float* din_ptr4 = din4;

                const float* weights_ptr = weights_c + 24;
                float* dout_ptr0 = dout0;

                float bias_c = 0.f;
                if (flag_bias){
                    bias_c = bias[c];
                }
                float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
                int* mask_ptr = mask;
                int loop = mid_cnt;
                const int s_8 = 8;   // post-increment step of 2 floats
                const int s_16 = 16; // post-increment step of 4 floats

                // Register roles inside the asm:
                //   q6..q10 : deinterleaved input (vld2.32 splits even/odd
                //             columns, yielding the stride-2 taps directly)
                //   q13,q14 : two partial accumulators, summed at the end
                //   q15     : zero (padding source and ReLU threshold)
                //   q11,q12 : tail lane masks in the right section
                // NOTE(review): w0 is declared as an input-only "w" operand,
                // yet the asm overwrites its lane 0 with the 25th weight and
                // restores it via q12 -- formally it should be an in/out
                // operand; confirm before touching the constraint list.
                asm volatile(
                    "vmov.i32   q15, #0x0           \n"
                    "pld [%[din_ptr0]]              \n"
                    "pld [%[din_ptr1]]              \n"
                    "pld [%[din_ptr2]]              \n"
                    "pld [%[din_ptr3]]              \n"
                    "pld [%[din_ptr4]]              \n"
                    "pld [%[mask]]                  \n"

                    // left
                    "vld2.32 {d16-d19}, [%[din_ptr0]]!          \n"
                    // NOTE(review): d26-d29 reads 32 bytes but vbias is only
                    // 16 bytes; q14 is zeroed right below, yet the read
                    // itself is out of bounds -- confirm/shrink to d26-d27.
                    "vld1.32 {d26-d29}, [%[vbias]]              \n"
                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"
                    "vmov.32 q14, q15                           \n"

                    //r0
                    "vmla.f32 q13, q8, %f[w0][0]                \n"
                    "vmla.f32 q14, q9, %f[w0][1]                \n"

                    "vld1.32 {d21[1]}, [%[din_ptr0]]            \n"
                    "vld2.32 {d16-d19}, [%[din_ptr1]]!          \n"
                    "sub %[din_ptr0], #8  \n"

                    "vmla.f32 q13, q6, %e[w0][0]                \n"
                    "vmla.f32 q14, q7, %e[w0][1]                \n"
                    "vmla.f32 q13, q10, %e[w1][0]               \n"

                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"

                    //r1
                    "vmla.f32 q13, q8, %f[w1][1]                \n"
                    "vmla.f32 q14, q9, %e[w2][0]                \n"

                    "vld1.32 {d21[1]}, [%[din_ptr1]]            \n"
                    "vld2.32 {d16-d19}, [%[din_ptr2]]!          \n"
                    "sub %[din_ptr1], #8                        \n"

                    "vmla.f32 q13, q6, %e[w1][1]                \n"
                    "vmla.f32 q14, q7, %f[w1][0]                \n"
                    "vmla.f32 q13, q10, %e[w2][1]               \n"

                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"

                    //r2
                    "vmla.f32 q13, q8, %e[w3][0]                \n"
                    "vmla.f32 q14, q9, %e[w3][1]                \n"

                    "vld1.32 {d21[1]}, [%[din_ptr2]]            \n"
                    "vld2.32 {d16-d19}, [%[din_ptr3]]!          \n"
                    "sub %[din_ptr2], #8                        \n"

                    "vmla.f32 q13, q6, %f[w2][0]                \n"
                    "vmla.f32 q14, q7, %f[w2][1]                \n"
                    "vmla.f32 q13, q10, %f[w3][0]               \n"

                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"

                    //r3
                    "vmla.f32 q13, q8, %e[w4][1]                \n"
                    "vmla.f32 q14, q9, %f[w4][0]                \n"

                    "vld1.32 {d21[1]}, [%[din_ptr3]]            \n"
                    "vld2.32 {d16-d19}, [%[din_ptr4]]!          \n"
                    "sub %[din_ptr3], #8                        \n"

                    "vmla.f32 q13, q6, %f[w3][1]                \n"
                    "vmla.f32 q14, q7, %e[w4][0]                \n"
                    "vmla.f32 q13, q10, %f[w4][1]               \n"

                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"

                    //r4
                    "vmla.f32 q13, q6, %e[w5][0]                \n"
                    "vmla.f32 q14, q7, %e[w5][1]                \n"

                    "vld1.32 {d21[1]}, [%[din_ptr4]]            \n"
                    "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8]   \n"
                    "sub %[din_ptr4], #8                        \n"

                    "vmla.f32 q13, q8, %f[w5][0]                \n"
                    "vmla.f32 q14, q9, %f[w5][1]                \n"

                    "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8]   \n"

                    // save w0 in q12, then borrow w0 lane 0 for weights[24]
                    "vmov.32 q12, %q[w0]                        \n"
                    "vld1.32 {%e[w0][0]}, [%[weights]]          \n"
                    "vmla.f32 q13, q10, %e[w0][0]               \n"
                    "vadd.f32 q13, q13, q14                     \n"
                    "vmov.f32 %q[w0], q12                        \n"
                    // ReLU
                    "vmax.f32 q13, q13, q15                     \n"
                    "cmp %[mid_cnt], #1                         \n"
                    "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16]  \n"
                    "vst1.32 {d26-d27}, [%[dout_ptr0]]!         \n"
                    "pld [%[din_ptr0]]                          \n"
                    "blt 2f                                     \n"

                    // mid
                    "1:                                         \n"
                    "vld1.32 {d26-d27}, [%[vbias]]              \n"
                    "vmov.32 q14, q15                           \n"

                    // r0
                    "vmla.f32 q13, q6,  %e[w0][0]               \n"
                    "vmla.f32 q14, q7,  %e[w0][1]               \n"

                    "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8]   \n"

                    "vmla.f32 q13, q8,  %f[w0][0]               \n"
                    "vmla.f32 q14, q9,  %f[w0][1]               \n"

                    "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8]   \n"

                    "vmla.f32 q13, q10, %e[w1][0]               \n"

                    "vld2.32 {d20-d23}, [%[din_ptr1]], %[s_16]  \n"

                    // r1
                    "vmla.f32 q13, q6,  %e[w1][1]               \n"
                    "vmla.f32 q14, q7,  %f[w1][0]               \n"
                    "pld [%[din_ptr1]]                          \n"

                    "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8]   \n"

                    "vmla.f32 q13, q8,  %f[w1][1]               \n"
                    "vmla.f32 q14, q9,  %e[w2][0]               \n"

                    "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8]   \n"

                    "vmla.f32 q13, q10, %e[w2][1]               \n"

                    "vld2.32 {d20-d23}, [%[din_ptr2]], %[s_16]  \n"

                    // r2
                    "vmla.f32 q13, q6,  %f[w2][0]               \n"
                    "vmla.f32 q14, q7,  %f[w2][1]               \n"
                    "pld [%[din_ptr2]]                          \n"

                    "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8]   \n"

                    "vmla.f32 q13, q8,  %e[w3][0]               \n"
                    "vmla.f32 q14, q9,  %e[w3][1]               \n"

                    "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8]   \n"

                    "vmla.f32 q13, q10, %f[w3][0]               \n"

                    "vld2.32 {d20-d23}, [%[din_ptr3]], %[s_16]  \n"

                    //r3
                    "vmla.f32 q13, q6,  %f[w3][1]               \n"
                    "vmla.f32 q14, q7,  %e[w4][0]               \n"
                    "pld [%[din_ptr3]]                          \n"

                    "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8]   \n"

                    "vmla.f32 q13, q8,  %e[w4][1]               \n"
                    "vmla.f32 q14, q9,  %f[w4][0]               \n"

                    "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8]   \n"

                    "vmla.f32 q13, q10, %f[w4][1]               \n"

                    "vld2.32 {d20-d23}, [%[din_ptr4]], %[s_16]  \n"

                    //r4
                    "vmla.f32 q13, q6,  %e[w5][0]               \n"
                    "vmla.f32 q14, q7,  %e[w5][1]               \n"
                    "pld [%[din_ptr4]]                          \n"

                    "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8]   \n"
                    "vld1.32 {%e[w0][0]}, [%[weights]]          \n"

                    "vmla.f32 q13, q8,  %f[w5][0]               \n"
                    "vmla.f32 q14, q9,  %f[w5][1]               \n"

                    "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8]   \n"

                    "vmla.f32 q13, q10, %e[w0][0]               \n"

                    "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16]  \n"

                    // restore w0 (saved in q12 by the left section)
                    "vmov.32 %q[w0], q12                        \n"
                    "vadd.f32 q13, q13, q14                     \n"
                    // ReLU
                    "vmax.f32 q13, q13, q15                     \n"
                    "subs %[mid_cnt], #1                        \n"
                    "vst1.32 {d26-d27}, [%[dout_ptr0]]!         \n"
                    "bne 1b                                     \n"

                    // right (masked tail): compute the last <=4 outputs with
                    // out-of-range lanes forced to zero via vbif; the result
                    // goes to out_buf0 and is copied out in C afterwards.
                    "2:                                         \n"
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vld1.32 {d26-d27}, [%[vbias]]              \n"
                    "vmov.32 q14, q15                           \n"

                    //r0
                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %e[w0][0]                \n"
                    "vmla.f32 q14, q7, %e[w0][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8]   \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %f[w0][0]                \n"
                    "vmla.f32 q14, q9, %f[w0][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "sub %[mask], #16                           \n"
                    "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8]   \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, %e[w1][0]               \n"

                    // r1
                    "vld2.32 {d20-d23}, [%[din_ptr1]]           \n"
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %e[w1][1]                \n"
                    "vmla.f32 q14, q7, %f[w1][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8]   \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %f[w1][1]                \n"
                    "vmla.f32 q14, q9, %e[w2][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "sub %[mask], #16                           \n"
                    "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8]   \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, %e[w2][1]               \n"

                    //r2
                    "vld2.32 {d20-d23}, [%[din_ptr2]]           \n"
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %f[w2][0]                \n"
                    "vmla.f32 q14, q7, %f[w2][1]                \n"

                    "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8]   \n"
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %e[w3][0]                \n"
                    "vmla.f32 q14, q9, %e[w3][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "sub %[mask], #16                           \n"
                    "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8]   \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, %f[w3][0]               \n"

                    //r3
                    "vld2.32 {d20-d23}, [%[din_ptr3]]           \n"
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %f[w3][1]                \n"
                    "vmla.f32 q14, q7, %e[w4][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8]   \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %e[w4][1]                \n"
                    "vmla.f32 q14, q9, %f[w4][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "sub %[mask], #16                           \n"
                    "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8]   \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, %f[w4][1]               \n"

                    //r4
                    "vld2.32 {d20-d23}, [%[din_ptr4]]           \n"
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %e[w5][0]                \n"
                    "vmla.f32 q14, q7, %e[w5][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    // weights[24] goes into d12[0] here (w0 already restored)
                    "vld1.32 {d12[0]}, [%[weights]]             \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %f[w5][0]                \n"
                    "vmla.f32 q14, q9, %f[w5][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, d12[0]                  \n"

                    "vadd.f32 q13, q13, q14                     \n"
                    // ReLU
                    "vmax.f32 q13, q13, q15                     \n"
                    "vst1.32 {d26-d27}, [%[out_buf0]]           \n"

                : [dout_ptr0] "+r" (dout_ptr0), [mid_cnt] "+r" (loop), [din_ptr0] "+r" (din_ptr0), \
                  [din_ptr1] "+r" (din_ptr1), [din_ptr2] "+r" (din_ptr2), [din_ptr3] "+r" (din_ptr3), \
                  [din_ptr4] "+r" (din_ptr4), [mask] "+r" (mask_ptr), [weights] "+r" (weights_ptr)
                : [w0] "w" (w0), [w1] "w" (w1), [w2] "w" (w2), [w3] "w" (w3), [w4] "w" (w4), [w5] "w" (w5), \
                  [vbias] "r" (vbias), [out_buf0] "r" (out_buf0), [s_8] "r" (s_8), \
                  [s_16] "r" (s_16)
                : "memory", "cc", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
                );

                // The left group + mid loop already stored (mid_cnt + 1) * 4
                // outputs via vst1; copy only the valid tail outputs from
                // out_buf0 (the rest of the 4-wide group is padding).
                int remain_cnt = w_out - (mid_cnt + 1) * 4;
                for (int i = 0; i < remain_cnt; ++i){
                    dout_ptr0[i] = out_buf0[i];
                }

                // Slide the five-row input window down by 2 rows (stride 2).
                din0 = din2;
                din1 = din3;
                din2 = din4;
                din3 = din2 + w_in;
                din4 = din3 + w_in;
                dout0 += w_out;
            }
        }
    }
}

//! small depthwise, win < 9;
//! small depthwise, win < 9;
//! 5x5 depthwise convolution, stride 2, padding 2, for narrow inputs
//! (w_in < 9, so every output row fits in a single 4-wide NEON vector).
//! @param din       input feature map, layout [num, ch_in, h_in, w_in]
//! @param dout      output feature map, layout [num, ch_out, h_out, w_out]
//! @param weights   per-channel 5x5 kernels, 25 floats per channel
//! @param bias      per-channel bias, read only when flag_bias is true
//! @param flag_relu unused here; the fused-relu variant is
//!                  conv_depthwise_5x5s2p2_relu_s
//! @param ctx       supplies a workspace of at least w_in floats used as a
//!                  zero row for the top/bottom padding
void conv_depthwise_5x5s2p2_s(const float* din, float* dout, \
      int num, int ch_out, int h_out, int w_out, \
      int ch_in, int h_in, int w_in, \
      const float* weights, const float* bias, \
      bool flag_bias, bool flag_relu, Context<ARM>* ctx){

    CHECK_LT(w_in, 9) << "only support win < 9";
    //! column mask consumed by the asm as three de-interleaved vld2 pairs;
    //! lanes past the (padded) row end are cleared so vbif zeroes those taps.
    int mask[12];
    memset(mask, 0xff, 12 * sizeof(int));
    int mask_cnt = 12 - w_in - 2;
    for (int i = 0; i < mask_cnt; ++i){
        mask[11 - i] = 0;
    }
    //! zero row standing in for rows above/below the image (pad = 2)
    float* zero_ptr = static_cast<float*>(ctx->get_work_space());
    memset(zero_ptr, 0, w_in * sizeof(float));
    int in_spatial_size = w_in * h_in;
    int out_spatial_size = w_out * h_out;
    int weights_spatial_size = 25;

    for (int n = 0; n < num; ++n){
        const float* din_batch = din + n * in_spatial_size * ch_in;
        float* dout_batch = dout + n * out_spatial_size * ch_out;
#pragma omp parallel for
        for (int c = 0; c < ch_in; ++c){
            const float* din_ch = din_batch + c * in_spatial_size;
            float* dout_ch = dout_batch + c * out_spatial_size;
            //! first output row reads two padded rows above the image
            const float* din0 = zero_ptr;
            const float* din1 = zero_ptr;
            const float* din2 = din_ch;
            const float* din3 = din2 + w_in;
            const float* din4 = din3 + w_in;

            //! one vector of results; w_out <= 4 because w_in < 9
            float out_buf0[4];
            float* dout0 = dout_ch;

            const float* weights_c = weights + c * weights_spatial_size;
            //! 25 weights: w0..w5 hold the first 24, the tail scalar is
            //! loaded inside the asm from weights_ptr (weights_c + 24)
            float32x4_t w0 = vld1q_f32(weights_c);
            float32x4_t w1 = vld1q_f32(weights_c + 4);
            float32x4_t w2 = vld1q_f32(weights_c + 8);
            float32x4_t w3 = vld1q_f32(weights_c + 12);
            float32x4_t w4 = vld1q_f32(weights_c + 16);
            float32x4_t w5 = vld1q_f32(weights_c + 20);
            for (int h = 0; h < h_out; h += 1){
                //! (h * 2 - 2) + 4 > h_in - 1
                //! bottom padding: redirect rows past the image to the zero row
                if (h * 2 + 3 > h_in){
                    switch (h * 2 + 3 - h_in){
                        case 4:
                            din1 = zero_ptr;
                            //! fall through
                        case 3:
                            din2 = zero_ptr;
                            //! fall through
                        case 2:
                            din3 = zero_ptr;
                            //! fall through
                        case 1:
                            din4 = zero_ptr;
                            //! fall through
                        default:
                            break;
                    }
                }
                const float* din_ptr0 = din0;
                const float* din_ptr1 = din1;
                const float* din_ptr2 = din2;
                const float* din_ptr3 = din3;
                const float* din_ptr4 = din4;

                const float* weights_ptr = weights_c + 24;
                float* dout_ptr0 = dout0;

                float bias_c = 0.f;
                if (flag_bias){
                    bias_c = bias[c];
                }
                float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
                int* mask_ptr = mask;
                const int s_8 = 8;

                //! computes one full output row (up to 4 pixels) into
                //! out_buf0: vld2 de-interleaves even/odd input columns
                //! (stride 2), vext realigns for the +/-1 taps, and vbif
                //! zeroes columns outside the padded row via the mask.
                asm volatile(
                    "vmov.i32  q15, #0x0                 \n"
                    "pld [%[din_ptr0]]                   \n"
                    "pld [%[din_ptr1]]                   \n"
                    "pld [%[din_ptr2]]                   \n"
                    "pld [%[din_ptr3]]                   \n"
                    "pld [%[din_ptr4]]                   \n"
                    "vld1.32 {d26-d27}, [%[vbias]]       \n"
                    "vmov.32 q14, q15                    \n"
                    "vld2.32 {d16-d19}, [%[din_ptr0]]!   \n"

                    //r0
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"
                    "vld1.32 {d21[1]}, [%[din_ptr0]]            \n"

                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %e[w0][0]                \n"
                    "vmla.f32 q14, q7, %e[w0][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %f[w0][0]                \n"
                    "vmla.f32 q14, q9, %f[w0][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "vld2.32 {d16-d19}, [%[din_ptr1]]!          \n"
                    "sub %[mask], #16                           \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, %e[w1][0]               \n"

                    //r1
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"
                    "vld1.32 {d21[1]}, [%[din_ptr1]]            \n"

                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q14, q6, %e[w1][1]                \n"
                    "vmla.f32 q13, q7, %f[w1][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q14, q8, %f[w1][1]                \n"
                    "vmla.f32 q13, q9, %e[w2][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "vld2.32 {d16-d19}, [%[din_ptr2]]!          \n"
                    "sub %[mask], #16                           \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q14, q10, %e[w2][1]               \n"

                    //r2
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"
                    "vld1.32 {d21[1]}, [%[din_ptr2]]            \n"

                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %f[w2][0]                \n"
                    "vmla.f32 q14, q7, %f[w2][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %e[w3][0]                \n"
                    "vmla.f32 q14, q9, %e[w3][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "vld2.32 {d16-d19}, [%[din_ptr3]]!          \n"
                    "sub %[mask], #16                           \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, %f[w3][0]               \n"

                    //r3
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"
                    "vld1.32 {d21[1]}, [%[din_ptr3]]            \n"

                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q14, q6, %f[w3][1]                \n"
                    "vmla.f32 q13, q7, %e[w4][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q14, q8, %e[w4][1]                \n"
                    "vmla.f32 q13, q9, %f[w4][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "vld2.32 {d16-d19}, [%[din_ptr4]]!          \n"
                    "sub %[mask], #16                           \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q14, q10, %f[w4][1]               \n"

                    //r4
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"
                    "vld1.32 {d21[1]}, [%[din_ptr4]]            \n"

                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %e[w5][0]                \n"
                    "vmla.f32 q14, q7, %e[w5][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vld1.32 {d12[0]}, [%[weights]]             \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %f[w5][0]                \n"
                    "vmla.f32 q14, q9, %f[w5][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, d12[0]                  \n"

                    "vadd.f32 q13, q13, q14                     \n"
                    "vst1.32 {d26-d27}, [%[out_buf0]]           \n"

                : [dout_ptr0] "+r" (dout_ptr0), [din_ptr0] "+r" (din_ptr0), [din_ptr1] "+r" (din_ptr1), [din_ptr2] "+r" (din_ptr2), \
                  [din_ptr3] "+r" (din_ptr3), [din_ptr4] "+r" (din_ptr4), [mask] "+r" (mask_ptr), [weights] "+r" (weights_ptr)
                : [vbias] "r" (vbias), [out_buf0] "r" (out_buf0), [s_8] "r" (s_8), [w0] "w" (w0), [w1] "w" (w1),\
                  [w2] "w" (w2), [w3] "w" (w3), [w4] "w" (w4), [w5] "w" (w5)
                : "memory", "cc", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
                );
                //! copy the valid prefix of the vector result (w_out <= 4)
                for (int i = 0; i < w_out; ++i){
                    dout_ptr0[i] = out_buf0[i];
                }
                //! advance the 5-row window by the stride (2 rows)
                din0 = din2;
                din1 = din3;
                din2 = din4;
                din3 = din2 + w_in;
                din4 = din3 + w_in;
                dout0 += w_out;
            }
        }
    }
}

//! small depthwise, win < 9;
//! small depthwise, win < 9;
//! 5x5 depthwise convolution, stride 2, padding 2, with fused ReLU, for
//! narrow inputs (w_in < 9, so every output row fits in one 4-wide vector).
//! Identical to conv_depthwise_5x5s2p2_s except for the final
//! "vmax.f32 q13, q13, q15" which clamps the result at zero.
//! @param din       input feature map, layout [num, ch_in, h_in, w_in]
//! @param dout      output feature map, layout [num, ch_out, h_out, w_out]
//! @param weights   per-channel 5x5 kernels, 25 floats per channel
//! @param bias      per-channel bias, read only when flag_bias is true
//! @param flag_relu unused here; ReLU is unconditionally fused
//! @param ctx       supplies a workspace of at least w_in floats used as a
//!                  zero row for the top/bottom padding
void conv_depthwise_5x5s2p2_relu_s(const float* din, float* dout, \
      int num, int ch_out, int h_out, int w_out, \
      int ch_in, int h_in, int w_in, \
      const float* weights, const float* bias, \
      bool flag_bias, bool flag_relu, Context<ARM>* ctx){

    CHECK_LT(w_in, 9) << "only support win < 9\n";
    //! column mask consumed by the asm as three de-interleaved vld2 pairs;
    //! lanes past the (padded) row end are cleared so vbif zeroes those taps.
    int mask[12];
    memset(mask, 0xff, 12 * sizeof(int));
    int mask_cnt = 12 - w_in - 2;
    for (int i = 0; i < mask_cnt; ++i){
        mask[11 - i] = 0;
    }
    //! zero row standing in for rows above/below the image (pad = 2)
    float* zero_ptr = static_cast<float*>(ctx->get_work_space());
    memset(zero_ptr, 0, w_in * sizeof(float));
    int in_spatial_size = w_in * h_in;
    int out_spatial_size = w_out * h_out;
    int weights_spatial_size = 25;

    for (int n = 0; n < num; ++n){
        const float* din_batch = din + n * in_spatial_size * ch_in;
        float* dout_batch = dout + n * out_spatial_size * ch_out;
#pragma omp parallel for
        for (int c = 0; c < ch_in; ++c){
            const float* din_ch = din_batch + c * in_spatial_size;
            float* dout_ch = dout_batch + c * out_spatial_size;
            //! first output row reads two padded rows above the image
            const float* din0 = zero_ptr;
            const float* din1 = zero_ptr;
            const float* din2 = din_ch;
            const float* din3 = din2 + w_in;
            const float* din4 = din3 + w_in;

            //! one vector of results; w_out <= 4 because w_in < 9
            float out_buf0[4];
            float* dout0 = dout_ch;

            const float* weights_c = weights + c * weights_spatial_size;
            //! 25 weights: w0..w5 hold the first 24, the tail scalar is
            //! loaded inside the asm from weights_ptr (weights_c + 24)
            float32x4_t w0 = vld1q_f32(weights_c);
            float32x4_t w1 = vld1q_f32(weights_c + 4);
            float32x4_t w2 = vld1q_f32(weights_c + 8);
            float32x4_t w3 = vld1q_f32(weights_c + 12);
            float32x4_t w4 = vld1q_f32(weights_c + 16);
            float32x4_t w5 = vld1q_f32(weights_c + 20);
            for (int h = 0; h < h_out; h += 1){
                //! (h * 2 - 2) + 4 > h_in - 1
                //! bottom padding: redirect rows past the image to the zero row
                if (h * 2 + 3 > h_in){
                    switch (h * 2 + 3 - h_in){
                        case 4:
                            din1 = zero_ptr;
                            //! fall through
                        case 3:
                            din2 = zero_ptr;
                            //! fall through
                        case 2:
                            din3 = zero_ptr;
                            //! fall through
                        case 1:
                            din4 = zero_ptr;
                            //! fall through
                        default:
                            break;
                    }
                }
                const float* din_ptr0 = din0;
                const float* din_ptr1 = din1;
                const float* din_ptr2 = din2;
                const float* din_ptr3 = din3;
                const float* din_ptr4 = din4;

                const float* weights_ptr = weights_c + 24;
                float* dout_ptr0 = dout0;

                float bias_c = 0.f;
                if (flag_bias){
                    bias_c = bias[c];
                }
                float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
                int* mask_ptr = mask;
                const int s_8 = 8;

                //! computes one full output row (up to 4 pixels) into
                //! out_buf0: vld2 de-interleaves even/odd input columns
                //! (stride 2), vext realigns for the +/-1 taps, vbif zeroes
                //! columns outside the padded row, and the trailing vmax
                //! applies the fused ReLU.
                asm volatile(
                    "vmov.i32  q15, #0x0                \n"
                    "pld [%[din_ptr0]]                  \n"
                    "pld [%[din_ptr1]]                  \n"
                    "pld [%[din_ptr2]]                  \n"
                    "pld [%[din_ptr3]]                  \n"
                    "pld [%[din_ptr4]]                  \n"
                    "vld1.32 {d26-d27}, [%[vbias]]      \n"
                    "vmov.32 q14, q15                   \n"
                    "vld2.32 {d16-d19}, [%[din_ptr0]]!  \n"

                    //r0
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"
                    "vld1.32 {d21[1]}, [%[din_ptr0]]            \n"

                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %e[w0][0]                \n"
                    "vmla.f32 q14, q7, %e[w0][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %f[w0][0]                \n"
                    "vmla.f32 q14, q9, %f[w0][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "vld2.32 {d16-d19}, [%[din_ptr1]]!          \n"
                    "sub %[mask], #16                           \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, %e[w1][0]               \n"

                    //r1
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"
                    "vld1.32 {d21[1]}, [%[din_ptr1]]            \n"

                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q14, q6, %e[w1][1]                \n"
                    "vmla.f32 q13, q7, %f[w1][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q14, q8, %f[w1][1]                \n"
                    "vmla.f32 q13, q9, %e[w2][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "vld2.32 {d16-d19}, [%[din_ptr2]]!          \n"
                    "sub %[mask], #16                           \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q14, q10, %e[w2][1]               \n"

                    //r2
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"
                    "vld1.32 {d21[1]}, [%[din_ptr2]]            \n"

                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %f[w2][0]                \n"
                    "vmla.f32 q14, q7, %f[w2][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %e[w3][0]                \n"
                    "vmla.f32 q14, q9, %e[w3][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "vld2.32 {d16-d19}, [%[din_ptr3]]!          \n"
                    "sub %[mask], #16                           \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, %f[w3][0]               \n"

                    //r3
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"
                    "vld1.32 {d21[1]}, [%[din_ptr3]]            \n"

                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q14, q6, %f[w3][1]                \n"
                    "vmla.f32 q13, q7, %e[w4][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q14, q8, %e[w4][1]                \n"
                    "vmla.f32 q13, q9, %f[w4][0]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "vld2.32 {d16-d19}, [%[din_ptr4]]!          \n"
                    "sub %[mask], #16                           \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q14, q10, %f[w4][1]               \n"

                    //r4
                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vext.32 q6, q15, q8, #3                    \n"
                    "vext.32 q7, q15, q9, #3                    \n"
                    "vext.32 q10, q8, q15, #1                   \n"
                    "vld1.32 {d21[1]}, [%[din_ptr4]]            \n"

                    "vbif.32 q6, q15, q11                       \n"
                    "vbif.32 q7, q15, q12                       \n"
                    "vmla.f32 q13, q6, %e[w5][0]                \n"
                    "vmla.f32 q14, q7, %e[w5][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]], %[s_8]       \n"
                    "vld1.32 {d12[0]}, [%[weights]]             \n"
                    "vbif.32 q8, q15, q11                       \n"
                    "vbif.32 q9, q15, q12                       \n"
                    "vmla.f32 q13, q8, %f[w5][0]                \n"
                    "vmla.f32 q14, q9, %f[w5][1]                \n"

                    "vld2.32 {d22-d25}, [%[mask]]               \n"
                    "vbif.32 q10, q15, q11                      \n"
                    "vmla.f32 q13, q10, d12[0]                  \n"

                    "vadd.f32 q13, q13, q14                     \n"
                    "vmax.f32 q13, q13, q15                     \n"
                    "vst1.32 {d26-d27}, [%[out_buf0]]           \n"

                : [dout_ptr0] "+r" (dout_ptr0), [din_ptr0] "+r" (din_ptr0), [din_ptr1] "+r" (din_ptr1), [din_ptr2] "+r" (din_ptr2), \
                  [din_ptr3] "+r" (din_ptr3), [din_ptr4] "+r" (din_ptr4), [mask] "+r" (mask_ptr), [weights] "+r" (weights_ptr)
                : [vbias] "r" (vbias), [out_buf0] "r" (out_buf0), [s_8] "r" (s_8), [w0] "w" (w0), [w1] "w" (w1),\
                  [w2] "w" (w2), [w3] "w" (w3), [w4] "w" (w4), [w5] "w" (w5)
                : "memory", "cc", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
                );
                //! copy the valid prefix of the vector result (w_out <= 4)
                for (int i = 0; i < w_out; ++i){
                    dout_ptr0[i] = out_buf0[i];
                }
                //! advance the 5-row window by the stride (2 rows)
                din0 = din2;
                din1 = din3;
                din2 = din4;
                din3 = din2 + w_in;
                din4 = din3 + w_in;
                dout0 += w_out;
            }
        }
    }
}
#endif //__aarch64__

} //namespace saber
} //namespace anakin

#endif //USE_ARM_PLACE
